### Train BERT from Scratch using Transformers in Python
* https://www.thepythoncode.com/article/pretraining-bert-huggingface-transformers-in-python
* https://huggingface.co/transformers/v3.2.0/training.html

In [1]:
import os
import json
import re
import unicodedata
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from transformers import *
from tokenizers import *
from datasets import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
import nltk
from nltk.data import load
import pickle

  from .autonotebook import tqdm as notebook_tqdm


### Loading the Model

In [2]:
model_path = "c09k_pretrained_bert_512_2"
tokenizer_path = 'c09k_pretrained_bert'

### model load

In [3]:
from transformers import AdamW

In [4]:
# load the model checkpoint
model1 = BertForSequenceClassification.from_pretrained(os.path.join(model_path, "checkpoint-9000"), return_dict=True, num_labels=18)
# load the tokenizer
tokenizer1 = BertTokenizerFast.from_pretrained(tokenizer_path, vocab_size=8000, local_files_only=True)
optimizer = AdamW(model1.parameters(), lr=1e-5)
# optimizer = A AdamW(model1.parameters(), lr=1e-5)

loading configuration file c09k_pretrained_bert_512_2/checkpoint-9000/config.json
Model config BertConfig {
  "_name_or_path": "c09k_pretrained_bert_512/checkpoint-15000",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_11": 11,
    "LABEL_12": 12,
    "LABEL_13": 13,
    "LABEL_14": 14,
    "LABEL_15": 15,
    "LABE

vocab_file vocab.txt
tokenizer_file tokenizer.json
added_tokens_file added_tokens.json
special_tokens_map_file special_tokens_map.json
tokenizer_config_file tokenizer_config.json




In [5]:
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model1.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model1.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5)

### finetuning data prepare
* 성능 비교를 목적으로 KoBERT의 학습 데이터를 그대로 사용

In [6]:
from torch.nn import functional as F

In [7]:
train_data_df = pd.read_csv('data/train_C09K11_220715.txt', sep='\t')  # text와 라벨 파일
test_dataset_df = pd.read_csv('data/test_C09K11_220715.txt', sep='\t')
train_dataset = Dataset.from_pandas(train_data_df)  # Dataset 객체 생성
test_dataset = Dataset.from_pandas(test_dataset_df)
finetune_dataset = DatasetDict()  # DatasetDict 객체 생성
finetune_dataset['train'] = train_dataset
finetune_dataset['test'] = test_dataset
finetune_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 9881
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 5203
    })
})

In [8]:
encoding = tokenizer1(train_dataset['text'][:16], return_tensors='pt', padding=True, truncation=True, max_length=512)
input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']

In [9]:
train_data_df[:3]

Unnamed: 0,text,label
0,광활성 형광체 프로브 및 이를 이용한 암세포 검출방법,1
1,"본 발명은 광활성 형광체 검출방법에 관한 것으로서, 화학식 1로 표시되는 화합물...",1
2,하기 [화학식 1]로 표시되는 OPA 또는 TPA 구조체를 포함하는 활용한 광활성 ...,1


In [10]:
num_warmup_steps = 2
n_epochs = 5
num_train_steps = n_epochs + 1
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_train_steps)

In [11]:
# 토크나이저 함수 객체 생성
def preprocess_function(examples):
    return tokenizer1(examples["text"], truncation=True, max_length=512, padding=True)
def preprocess_function1(examples):
    return examples['label']

In [12]:
tokenized_finetune_dataset = finetune_dataset.map(preprocess_function, batched=True)  
# DatasetDict의 'text'를 토크나이징, finetune_dataset에는 features: ['text', 'label']만 있었으나,
# tokenized_finetune_dataset에는 'input_ids', 'token_type_ids', 'attention_mask'가 추가됨

100%|██████████| 10/10 [00:01<00:00,  6.31ba/s]
100%|██████████| 6/6 [00:00<00:00,  6.96ba/s]


In [13]:
tokenized_finetune_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9881
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5203
    })
})

In [14]:
# tokenized_finetune_dataset['train']

In [23]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer1)

In [24]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [25]:
# training_args = TrainingArguments(output_dir="c09k_finetuned_bert")
training_args = TrainingArguments(
    output_dir="c09k_finetuned_bert_512",
    evaluation_strategy="epoch",    # 'steps': evaluate each `logging_steps`, 'epoch'  : each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_steps=500,             # evaluate, log and save model checkpoints every 1000 step
    save_steps=1000,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [26]:
trainer1 = Trainer(
    model=model1,
    args=training_args,
    train_dataset=tokenized_finetune_dataset['train'],
    eval_dataset=tokenized_finetune_dataset['test'],
#     compute_metrics=compute_metrics,
#     tokenizer=tokenizer1,
    data_collator=data_collator,
)

In [19]:
trainer1.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 9881
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 6180


Epoch,Training Loss,Validation Loss
1,1.9587,2.505204
2,1.6296,2.509042
3,1.4355,2.55106
4,1.3232,2.662175
5,1.2264,2.673572


Saving model checkpoint to c09k_finetuned_bert_512/checkpoint-1000
Configuration saved in c09k_finetuned_bert_512/checkpoint-1000/config.json
Model weights saved in c09k_finetuned_bert_512/checkpoint-1000/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 5203
  Batch size = 8
Saving model checkpoint to c09k_finetuned_bert_512/checkpoint-2000
Configuration saved in c09k_finetuned_bert_512/checkpoint-2000/config.json
Model weights saved in c09k_finetuned_bert_512/checkpoint-2000/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you 

TrainOutput(global_step=6180, training_loss=1.5340524642598667, metrics={'train_runtime': 3368.677, 'train_samples_per_second': 14.666, 'train_steps_per_second': 1.835, 'total_flos': 1.300086909471744e+16, 'train_loss': 1.5340524642598667, 'epoch': 5.0})

### finetuning된 모델 성능 평가

In [34]:
# load the model checkpoint
model2 = BertForSequenceClassification.from_pretrained(
    os.path.join('c09k_finetuned_bert_512', "checkpoint-3000"),
    return_dict=True, num_labels=18)

loading configuration file c09k_finetuned_bert_512/checkpoint-3000/config.json
Model config BertConfig {
  "_name_or_path": "c09k_pretrained_bert_512_2/checkpoint-9000",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_11": 11,
    "LABEL_12": 12,
    "LABEL_13": 13,
    "LABEL_14": 14,
    "LABEL_15": 1

In [35]:
# load the tokenizer
tokenizer2 = BertTokenizerFast.from_pretrained(tokenizer_path, vocab_size=8000, local_files_only=True)

loading file vocab.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file c09k_pretrained_bert/config.json
Model config BertConfig {
  "_name_or_path": "c09k_pretrained_bert",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "cls_token": "[CLS]",
  "do_lower_case": true,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "mask_token": "[MASK]",
  "max_len": 512,
  "max_position_embeddings": 512,
  "model_max_length": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token": "[PAD]",
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "sep_token": "[SEP]",
  "transformers_version": "4.22.0.dev0",
  "type_vocab_size": 2,
  "unk_token": "[UNK]",
  "use_cache": true,
  "vocab_size": 30522
}



vocab_file vocab.txt
tokenizer_file tokenizer.json
added_tokens_file added_tokens.json
special_tokens_map_file special_tokens_map.json
tokenizer_config_file tokenizer_config.json


In [36]:
tokenizer2.vocab_size

8000

In [37]:
optimizer = AdamW(model1.parameters(), lr=1e-5)



In [38]:
# ?pipeline
text_classifier = pipeline('text-classification', model=model2, tokenizer=tokenizer2)

In [39]:
test_output = trainer1.predict(test_dataset=tokenized_finetune_dataset['test'])

The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 5203
  Batch size = 8


In [57]:
y_pred = np.argmax(test_output.predictions, axis=1)

In [58]:
y_true = np.array(test_dataset['label'])
y_true

array([0, 0, 0, ..., 9, 9, 9])

In [42]:
test_output.label_ids == test_label

array([ True,  True,  True, ...,  True,  True,  True])

In [43]:
result_df = pd.DataFrame()
result_df['label'] = test_label
result_df['pred'] = pred
result_df['input'] = np.array(test_dataset['text'])

In [44]:
with open('data/c09k_label_ind.pickle', 'rb') as f:
    label_ind = pickle.load(f)
ind_label = {v:k for k, v in label_ind.items()}
# ind_label = pickle.loads('data/c09k_ind_label.pickle', )
# label_ind = pickle.loads('data/c09k_label_ind.pickle')

In [45]:
result_df2 = result_df.replace({'label': ind_label, 'pred': ind_label})
result_df2.to_csv('data/finetuned_predict_result_c09k.csv', encoding='utf-8', index=False)

In [70]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, f1_score, classification_report

In [69]:
?accuracy_score

In [72]:
print(classification_report(y_true=y_true, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        99
           1       0.00      0.00      0.00        72
           2       0.00      0.00      0.00        70
           3       0.05      0.01      0.01       691
           4       0.10      0.08      0.09       469
           5       0.10      0.09      0.09       435
           6       0.11      0.11      0.11       463
           7       0.00      0.00      0.00       589
           8       0.03      0.02      0.02       356
           9       0.14      0.01      0.03       568
          10       0.04      0.04      0.04       296
          11       0.07      0.14      0.10       202
          12       0.00      0.00      0.00       133
          13       0.00      0.00      0.00       196
          14       0.03      0.07      0.04       122
          15       0.05      0.19      0.08       219
          16       0.00      0.00      0.00       205
          17       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [68]:
print(accuracy_score(y_true=y_true, y_pred=y_pred, average=None),
recall_score(y_true=y_true, y_pred=y_pred, average=None),
f1_score(y_true=y_true, y_pred=y_pred, average=None))


TypeError: accuracy_score() got an unexpected keyword argument 'average'

In [49]:
confusion_matrix(test_label, pred
#                  , labels=['K0', 'K1', 'K21', 'K211', 'K212', 'K2121', 'K2122', 'K2123', 'K213', 'K2131', 'K2132', 'K2133', 'K214', 'K22', 'K23', 'K24', 'K241', 'K242']
                )

array([[  0,   0,   0,   8,  11,  12,   8,   0,   2,   5,   0,  10,   0,
          6,   0,   1,   1,  35],
       [  0,   0,   0,   0,   1,   5,  16,   0,   2,   0,   3,  10,   0,
          2,   0,   2,  14,  17],
       [  0,   0,   0,   0,   4,   2,   6,   0,  25,   2,   0,   0,   0,
          7,   9,   8,   0,   7],
       [ 12,   3,   4,   5,  37,  36,  62,   1,  21,  10,  42,  36,   0,
          5,  35, 117,   7, 258],
       [  2,   1,   4,   1,  38,  24,  41,   0,  18,   5,  18,  36,   0,
          0,  23,  38,   3, 217],
       [ 10,   1,   0,   5,  40,  37,  37,   0,  13,   5,  27,  33,   0,
          2,  17,  36,   0, 172],
       [  6,   3,   0,  14,  22,  51,  50,   0,  22,   1,  33,  27,   0,
          3,  19,  86,   0, 126],
       [  9,   2,   0,  12,  27,  47,  57,   0,  29,   4,  34,  35,   0,
          4,  27, 142,   0, 160],
       [  1,   0,   0,   4,  50,  24,  26,   0,   6,   4,  20,  35,   0,
          0,  14,  50,   0, 122],
       [ 20,   5,   4,   9,  33,  22,

In [84]:
def eval_c09k(path, chkpt):
    model = BertForSequenceClassification.from_pretrained(os.path.join(path, chkpt), return_dict=True, num_labels=18)
    tokenizer = BertTokenizerFast.from_pretrained('c09k_pretrained_bert', vocab_size=8000, local_files_only=True)
    optimizer = AdamW(model1.parameters(), lr=1e-5)
    text_classifier = pipeline('text-classification', model=model, tokenizer=tokenizer)
    # training_args = TrainingArguments(output_dir="c09k_finetuned_bert")
    training_args = TrainingArguments(
        output_dir="c09k_finetuned_bert_512",
        evaluation_strategy="epoch",    # 'steps': evaluate each `logging_steps`, 'epoch'  : each epoch
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=5,
        weight_decay=0.01,
        logging_steps=500,             # evaluate, log and save model checkpoints every 1000 step
        save_steps=1000,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        eval_dataset=tokenized_finetune_dataset['test'],
#         compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )
    test_output = trainer.predict(test_dataset=tokenized_finetune_dataset['test'])
    y_pred = np.argmax(test_output.predictions, axis=1)
    y_true = np.array(test_dataset['label'])
    return [classification_report(y_true, y_pred), confusion_matrix(y_true, y_pred), y_pred]

In [85]:
chks = [['c09k_finetuned_bert_512', 'checkpoint-3000'],
['c09k_finetuned_bert_512', 'checkpoint-6000'],
['c09k_finetuned_bert_512', 'checkpoint-9000'],
['c09k_finetuned_bert_512', 'checkpoint-12000'],
['c09k_finetuned_bert_512', 'checkpoint-15000'],
['c09k_finetuned_bert_512', 'checkpoint-4000'],
['c09k_finetuned_bert_512', 'checkpoint-5000'],
['c09k_finetuned_bert_512', 'checkpoint-7000']]
result = []
for chk in chks:
    try:
        report, conf_mat, y_pred = eval_c09k(chk[0], chk[1])
        result.append([chk, report, conf_mat, y_pred])
    except:
        pass
#     finally:


loading configuration file c09k_finetuned_bert_512/checkpoint-3000/config.json
Model config BertConfig {
  "_name_or_path": "c09k_pretrained_bert_512_2/checkpoint-9000",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_11": 11,
    "LABEL_12": 12,
    "LABEL_13": 13,
    "LABEL_14": 14,
    "LABEL_15": 1

vocab_file vocab.txt
tokenizer_file tokenizer.json
added_tokens_file added_tokens.json
special_tokens_map_file special_tokens_map.json
tokenizer_config_file tokenizer_config.json


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
loading configuration file c09k_finetuned_bert_512/checkpoint-6000/config.json
Model config BertConfig {
  "_name_or_path": "c09k_pretrained_bert_512_2/checkpoint-9000",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "l

vocab_file vocab.txt
tokenizer_file tokenizer.json
added_tokens_file added_tokens.json
special_tokens_map_file special_tokens_map.json
tokenizer_config_file tokenizer_config.json


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
loading configuration file c09k_finetuned_bert_512/checkpoint-4000/config.json
Model config BertConfig {
  "_name_or_path": "c09k_pretrained_bert_512_2/checkpoint-9000",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "l

vocab_file vocab.txt
tokenizer_file tokenizer.json
added_tokens_file added_tokens.json
special_tokens_map_file special_tokens_map.json
tokenizer_config_file tokenizer_config.json


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
loading configuration file c09k_finetuned_bert_512/checkpoint-5000/config.json
Model config BertConfig {
  "_name_or_path": "c09k_pretrained_bert_512_2/checkpoint-9000",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "l

vocab_file vocab.txt
tokenizer_file tokenizer.json
added_tokens_file added_tokens.json
special_tokens_map_file special_tokens_map.json
tokenizer_config_file tokenizer_config.json


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [86]:
for chk, report, confusion_mat, y_pred in result:
    print(chk, report, '\n', confusion_mat, '\n', '--------------------------------------------------------')

['c09k_finetuned_bert_512', 'checkpoint-3000']               precision    recall  f1-score   support

           0       0.00      0.00      0.00        99
           1       0.62      0.92      0.74        72
           2       0.89      0.70      0.78        70
           3       0.18      0.05      0.08       691
           4       0.25      0.41      0.32       469
           5       0.20      0.17      0.19       435
           6       0.15      0.27      0.19       463
           7       0.19      0.17      0.18       589
           8       0.17      0.06      0.09       356
           9       0.38      0.55      0.45       568
          10       0.26      0.24      0.25       296
          11       0.17      0.24      0.20       202
          12       0.00      0.00      0.00       133
          13       0.04      0.03      0.03       196
          14       0.21      0.25      0.22       122
          15       0.18      0.03      0.05       219
          16       0.29      0.41 

In [89]:
# best: ['c09k_finetuned_bert_512', 'checkpoint-5000']
y_pred = result[3][3]


array([10, 10,  5, ...,  7,  7,  6])

In [73]:
test1 = eval_c09k('c09k_finetuned_bert_512', 'checkpoint-3000')
test1

loading configuration file c09k_finetuned_bert_512/checkpoint-3000/config.json
Model config BertConfig {
  "_name_or_path": "c09k_pretrained_bert_512_2/checkpoint-9000",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_11": 11,
    "LABEL_12": 12,
    "LABEL_13": 13,
    "LABEL_14": 14,
    "LABEL_15": 1

vocab_file vocab.txt
tokenizer_file tokenizer.json
added_tokens_file added_tokens.json
special_tokens_map_file special_tokens_map.json
tokenizer_config_file tokenizer_config.json


AttributeError: module 'datasets.metric' has no attribute 'compute'

In [46]:
from sklearn.metrics import accuracy_score

In [47]:
accuracy_score(pred, test_label)

0.24043820872573515

In [48]:
for i in range(10):
    try:
        print(test_dataset['text'][i])
        print('pred: ', text_classifier(test_dataset['text'][i]), ', label: ', test_dataset['label'][i], '\n')
    except:
        pass

보안 인쇄물의 위변조 확인 방법
pred:  [{'label': 'LABEL_7', 'score': 0.3763827383518219}] , label:  0 

보안 인쇄물의 위변조 확인 방법
pred:  [{'label': 'LABEL_7', 'score': 0.3763827383518219}] , label:  0 

  본 발명은 보안잉크가 인쇄된 보안 인쇄물에 관한 것으로서, 보다 상세하게는, 발광색상, 지속시간 또는 여기파장이 다른 형광체 또는 인광체를 포함하는 보안잉크가 인쇄된 보안 인쇄물에 관한 것이다. 이를 위해 보안 인쇄물은 제1영역 및 제2영역으로 구분되는 것으로서, 제1영역은 청색 형광체를 포함하는 보안잉크로 인쇄되고, 제2영역은 청색 형광체, 녹색 인광체 및 적색 인광체를 포함하는 보안잉크로 인쇄되는 것을 특징으로 한다.  
pred:  [{'label': 'LABEL_4', 'score': 0.227374866604805}] , label:  0 

  본 발명은 보안잉크가 인쇄된 보안 인쇄물에 관한 것으로서, 보다 상세하게는, 발광색상, 지속시간 또는 여기파장이 다른 형광체 또는 인광체를 포함하는 보안잉크가 인쇄된 보안 인쇄물에 관한 것이다. 이를 위해 보안 인쇄물은 제1영역 및 제2영역으로 구분되는 것으로서, 제1영역은 청색 형광체를 포함하는 보안잉크로 인쇄되고, 제2영역은 청색 형광체, 녹색 인광체 및 적색 인광체를 포함하는 보안잉크로 인쇄되는 것을 특징으로 한다.  
pred:  [{'label': 'LABEL_4', 'score': 0.227374866604805}] , label:  0 

UV 광원을 사용한 보안 인쇄물의 위변조 확인 방법에 있어서,피인쇄물을 준비하는 단계;UV 광원을 상기 제1영역 및 제2영역에 조사하여, 상기 제1영역은 제1색으로 발광하고, 제2영역은 제2색으로 발광하는 다색 발광 단계;UV 광원의 조사를 중지하는 단계; 및UV 조사가 중지된 이후, 상기 제1영역의 발광은 사라지고, 동시에 제2영역은 소

In [119]:
epoch_loss = 0
epoch_acc = 0
for epoch in range(n_epochs):
    scheduler.step()


0

In [86]:
# load the model checkpoint
model2 = BertForSequenceClassification.from_pretrained(
    os.path.join('c09k_finetuned_bert', "checkpoint-3000"),
    return_dict=True, num_labels=18)
# load the tokenizer
tokenizer2 = BertTokenizerFast.from_pretrained(model_path, vocab_size=8000, local_files_only=True)

loading configuration file c09k_finetuned_bert/checkpoint-3000/config.json
Model config BertConfig {
  "_name_or_path": "c09k_pretrained_bert/checkpoint-7500",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_11": 11,
    "LABEL_12": 12,
    "LABEL_13": 13,
    "LABEL_14": 14,
    "LABEL_15": 15,
    "LA

vocab_file vocab.txt
tokenizer_file tokenizer.json
added_tokens_file added_tokens.json
special_tokens_map_file special_tokens_map.json
tokenizer_config_file tokenizer_config.json


In [88]:
# training_args = TrainingArguments(output_dir="c09k_finetuned_bert")
training_args = TrainingArguments(
    output_dir="c09k_finetuned_bert2",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [89]:
trainer2 = Trainer(
    model=model2,
    args=training_args,
    train_dataset=tokenized_finetune_dataset['train'],
    eval_dataset=tokenized_finetune_dataset['test'],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer2,
    data_collator=data_collator,
)

In [90]:
trainer2.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 9881
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 6180


Step,Training Loss
500,1.4499
1000,1.359
1500,1.3058
2000,1.2929
2500,1.2847
3000,1.2649
3500,1.2378
4000,1.2165
4500,1.1905
5000,1.175


Saving model checkpoint to c09k_finetuned_bert2/checkpoint-500
Configuration saved in c09k_finetuned_bert2/checkpoint-500/config.json
Model weights saved in c09k_finetuned_bert2/checkpoint-500/pytorch_model.bin
tokenizer config file saved in c09k_finetuned_bert2/checkpoint-500/tokenizer_config.json
Special tokens file saved in c09k_finetuned_bert2/checkpoint-500/special_tokens_map.json
Saving model checkpoint to c09k_finetuned_bert2/checkpoint-1000
Configuration saved in c09k_finetuned_bert2/checkpoint-1000/config.json
Model weights saved in c09k_finetuned_bert2/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in c09k_finetuned_bert2/checkpoint-1000/tokenizer_config.json
Special tokens file saved in c09k_finetuned_bert2/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to c09k_finetuned_bert2/checkpoint-1500
Configuration saved in c09k_finetuned_bert2/checkpoint-1500/config.json
Model weights saved in c09k_finetuned_bert2/checkpoint-1500/pytorch_model.bin
tok

TrainOutput(global_step=6180, training_loss=1.2481813918425426, metrics={'train_runtime': 810.8951, 'train_samples_per_second': 121.853, 'train_steps_per_second': 7.621, 'total_flos': 3250217273679360.0, 'train_loss': 1.2481813918425426, 'epoch': 10.0})

In [31]:
# https://bo-10000.tistory.com/154
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    auc = roc_auc_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'auroc': auc
    }



# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     predictions = np.argmax(logits, axis=-1)
#     return metric.compute(predictions=predictions, references=labels)


In [95]:
# load the model checkpoint
model3 = BertForSequenceClassification.from_pretrained(
    os.path.join('c09k_finetuned_bert2', "checkpoint-6000"),
    return_dict=True, num_labels=18)
# load the tokenizer
tokenizer3 = BertTokenizerFast.from_pretrained(model_path, vocab_size=8000, local_files_only=True)
training_args = TrainingArguments(
    output_dir="c09k_finetuned_bert2",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01, eval_steps=1000, resume_from_checkpoint=os.path.join('c09k_finetuned_bert', "checkpoint-3000")
)
trainer3 = Trainer(
    model=model3,
    args=training_args,
    train_dataset=tokenized_finetune_dataset['train'],
    eval_dataset=tokenized_finetune_dataset['test'],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer3,
    data_collator=data_collator
)
trainer3.train()

loading configuration file c09k_finetuned_bert/checkpoint-3000/config.json
Model config BertConfig {
  "_name_or_path": "c09k_pretrained_bert/checkpoint-7500",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_11": 11,
    "LABEL_12": 12,
    "LABEL_13": 13,
    "LABEL_14": 14,
    "LABEL_15": 15,
    "LA

vocab_file vocab.txt
tokenizer_file tokenizer.json
added_tokens_file added_tokens.json
special_tokens_map_file special_tokens_map.json
tokenizer_config_file tokenizer_config.json


Step,Training Loss
500,1.4499
1000,1.359
1500,1.3058
2000,1.2929
2500,1.2847
3000,1.2649
3500,1.2378
4000,1.2165
4500,1.1905
5000,1.175


Saving model checkpoint to c09k_finetuned_bert2/checkpoint-500
Configuration saved in c09k_finetuned_bert2/checkpoint-500/config.json
Model weights saved in c09k_finetuned_bert2/checkpoint-500/pytorch_model.bin
tokenizer config file saved in c09k_finetuned_bert2/checkpoint-500/tokenizer_config.json
Special tokens file saved in c09k_finetuned_bert2/checkpoint-500/special_tokens_map.json
Saving model checkpoint to c09k_finetuned_bert2/checkpoint-1000
Configuration saved in c09k_finetuned_bert2/checkpoint-1000/config.json
Model weights saved in c09k_finetuned_bert2/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in c09k_finetuned_bert2/checkpoint-1000/tokenizer_config.json
Special tokens file saved in c09k_finetuned_bert2/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to c09k_finetuned_bert2/checkpoint-1500
Configuration saved in c09k_finetuned_bert2/checkpoint-1500/config.json
Model weights saved in c09k_finetuned_bert2/checkpoint-1500/pytorch_model.bin
tok

TrainOutput(global_step=6180, training_loss=1.2481813918425426, metrics={'train_runtime': 816.681, 'train_samples_per_second': 120.99, 'train_steps_per_second': 7.567, 'total_flos': 3250217273679360.0, 'train_loss': 1.2481813918425426, 'epoch': 10.0})

In [53]:
test_label

NameError: name 'test_label' is not defined

### 테스트 결과 확인

In [49]:
# # https://bo-10000.tistory.com/154
# # https://stackoverflow.com/questions/59666138/sklearn-roc-auc-score-with-multi-class-ovr-should-have-none-average-available
# def compute_metrics(pred):
#     labels = pred.label_ids
#     preds = pred.predictions.argmax(-1)
#     precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='micro')
#     acc = accuracy_score(labels, preds)
#     auc = roc_auc_score(labels, preds, multi_class="ovo",average='macro')
#     return {
#         'accuracy': acc,
#         'f1': f1,
#         'precision': precision,
#         'recall': recall,
#         'auroc': auc
#     }

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


In [55]:
# load the model checkpoint
model3 = BertForSequenceClassification.from_pretrained(
    os.path.join('c09k_finetuned_bert2', "checkpoint-6000"),
    return_dict=True, num_labels=18)
# load the tokenizer
tokenizer3 = BertTokenizerFast.from_pretrained(model_path, vocab_size=8000, local_files_only=True)
training_args = TrainingArguments(
    output_dir="c09k_finetuned_bert2",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01, eval_steps=1000, resume_from_checkpoint=os.path.join('c09k_finetuned_bert', "checkpoint-3000")
)
trainer3 = Trainer(
    model=model3,
    args=training_args,
    train_dataset=tokenized_finetune_dataset['train'],
    eval_dataset=tokenized_finetune_dataset['test'],
#     compute_metrics=compute_metrics,
    tokenizer=tokenizer3,
    data_collator=data_collator
)

loading configuration file c09k_finetuned_bert2/checkpoint-6000/config.json
Model config BertConfig {
  "_name_or_path": "c09k_finetuned_bert/checkpoint-3000",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_11": 11,
    "LABEL_12": 12,
    "LABEL_13": 13,
    "LABEL_14": 14,
    "LABEL_15": 15,
    "LA

vocab_file vocab.txt
tokenizer_file tokenizer.json
added_tokens_file added_tokens.json
special_tokens_map_file special_tokens_map.json
tokenizer_config_file tokenizer_config.json


In [56]:
metrics = trainer3.evaluate(eval_dataset=tokenized_finetune_dataset['test'])

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 5203
  Batch size = 16


In [57]:
metrics

{'eval_loss': 2.973349094390869,
 'eval_runtime': 10.3277,
 'eval_samples_per_second': 503.791,
 'eval_steps_per_second': 31.566}

In [58]:
test_output = trainer3.predict(test_dataset=tokenized_finetune_dataset['test'])
pred = np.argmax(test_output.predictions, axis=1)
pred

The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 5203
  Batch size = 16


array([9, 9, 9, ..., 7, 9, 9])

In [59]:
# test_label = np.array(test_dataset['label'])
test_label = np.array(test_dataset['label'])
test_label

array([0, 0, 0, ..., 9, 9, 9])

In [38]:
tokenized_finetune_dataset['test']

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 5203
})

In [60]:
len(pred)

5203

In [72]:
result_df = pd.DataFrame()
result_df['label'] = test_label
result_df['pred'] = pred
result_df['input'] = np.array(test_dataset['text'])

In [74]:
result_df.loc[result_df['label']==result_df['pred']]

Unnamed: 0,label,pred,input
99,1,1,"화합물, 이를 포함하는 시스테인 탐지용 조성물, 및 시스테인 검출 방법"
100,1,1,"본 발명은 화합물, 이를 포함하는 시스테인 탐지용 조성물, 및 시스테인 검출 방..."
102,1,1,"1. 발명은 화합물, 이를 포함하는 시스테인 탐지용 조성물, 및 시스테인 검출 방법..."
104,1,1,"화합물, 이를 포함하는 시스테인 탐지용 조성물, 및 시스테인 검출 방법 본 발명..."
105,1,1,"화합물, 이를 포함하는 시스테인 탐지용 조성물, 및 시스테인 검출 방법 하기 화학식..."
...,...,...,...
5194,9,9,유기 전계발광 재료 및 디바이스
5197,9,9,"1. 하기 화학식 I, 화학식 II, 화학식 III 또는 화학식 IV의 리간드..."
5199,9,9,"유기 전계발광 재료 및 디바이스 하기 화학식 I, 화학식 II, 화학식 III ..."
5201,9,9,"유기 전계발광 재료 및 디바이스 1. 하기 화학식 I, 화학식 II, 화학식 ..."


In [75]:
result_df.loc[result_df['label']!=result_df['pred']]

Unnamed: 0,label,pred,input
0,0,9,보안 인쇄물의 위변조 확인 방법
1,0,9,보안 인쇄물의 위변조 확인 방법
2,0,9,"본 발명은 보안잉크가 인쇄된 보안 인쇄물에 관한 것으로서, 보다 상세하게는, 발..."
3,0,9,"본 발명은 보안잉크가 인쇄된 보안 인쇄물에 관한 것으로서, 보다 상세하게는, 발..."
4,0,1,"UV 광원을 사용한 보안 인쇄물의 위변조 확인 방법에 있어서,피인쇄물을 준비하는 단..."
...,...,...,...
5193,3,9,유기금속 화합물 및 이를 포함한 유기 발광 소자 1. 하기 화학식 1로 표시되는 유...
5195,9,7,"하기 화학식 I, 화학식 II, 화학식 III 또는 화학식 IV의 리간드 LA를..."
5196,9,7,"하기 화학식 I, 화학식 II, 화학식 III 또는 화학식 IV의 리간드 LA를 포..."
5198,9,7,"1. 양태에서, 본 개시는 하기 화학식 I, 화학식 II, 화학식 III 또는 화학..."


In [89]:
with open('data/c09k_label_ind.pickle', 'rb') as f:
    label_ind = pickle.load(f)
ind_label = {v:k for k, v in label_ind.items()}
# ind_label = pickle.loads('data/c09k_ind_label.pickle', )
# label_ind = pickle.loads('data/c09k_label_ind.pickle')

In [90]:
result_df2 = result_df.replace({'label': ind_label, 'pred': ind_label})

In [93]:
result_df2.to_csv('data/predict_result_c09k.csv', encoding='utf-8', index=False)