### Sentence Sampling Classifier
* c09k 분류 학습 모델을 사용하여 분류한 결과 분류 예측에 성공한 문장과 실패한 문장을 구분
* 분류 예측에 실패한 문장의 라벨을 0, 성공한 문장의 라벨을 1로 하여 학습 데이터를 생성
* 분류 예측에 활용될 수 있는 문장을 선별하는 분류기를 학습

In [1]:
import os
import json
import re
import unicodedata
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from transformers import *
from tokenizers import *
from datasets import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
import nltk
from nltk.data import load
import pickle

  from .autonotebook import tqdm as notebook_tqdm


### Loading the c09k Classify Model

In [2]:
from transformers import AdamW

In [3]:
tokenizer_path = 'c09k_pretrained_bert'
model_load_path = 'c09k_finetuned_bert_512'
chkpoint = 'checkpoint-5000'
model_save_path = 'c09k_sampling_model'

In [22]:
# load the model checkpoint
model1 = BertForSequenceClassification.from_pretrained(os.path.join(model_load_path, chkpoint), return_dict=True, num_labels=18)
# load the tokenizer
tokenizer1 = BertTokenizerFast.from_pretrained(tokenizer_path, vocab_size=8000, local_files_only=True)
optimizer = AdamW(model1.parameters(), lr=1e-5)
# optimizer = A AdamW(model1.parameters(), lr=1e-5)

loading configuration file c09k_finetuned_bert_512/checkpoint-5000/config.json
Model config BertConfig {
  "_name_or_path": "c09k_pretrained_bert_512_2/checkpoint-9000",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_11": 11,
    "LABEL_12": 12,
    "LABEL_13": 13,
    "LABEL_14": 14,
    "LABEL_15": 1

vocab_file vocab.txt
tokenizer_file tokenizer.json
added_tokens_file added_tokens.json
special_tokens_map_file special_tokens_map.json
tokenizer_config_file tokenizer_config.json




In [7]:
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model1.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model1.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5)

### finetuning data prepare
* 성능 비교를 목적으로 KoBERT의 학습 데이터를 그대로 사용

In [8]:
from torch.nn import functional as F

In [9]:
train_data_df = pd.read_csv('data/train_C09K11_220715.txt', sep='\t')  # text와 라벨 파일
test_dataset_df = pd.read_csv('data/test_C09K11_220715.txt', sep='\t')
train_dataset = Dataset.from_pandas(train_data_df)  # Dataset 객체 생성
test_dataset = Dataset.from_pandas(test_dataset_df)
finetune_dataset = DatasetDict()  # DatasetDict 객체 생성
finetune_dataset['train'] = train_dataset
finetune_dataset['test'] = test_dataset
finetune_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 9881
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 5203
    })
})

In [10]:
encoding = tokenizer1(train_dataset['text'][:16], return_tensors='pt', padding=True, truncation=True, max_length=512)
input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']

In [11]:
train_data_df[:3]

Unnamed: 0,text,label
0,광활성 형광체 프로브 및 이를 이용한 암세포 검출방법,1
1,"본 발명은 광활성 형광체 검출방법에 관한 것으로서, 화학식 1로 표시되는 화합물...",1
2,하기 [화학식 1]로 표시되는 OPA 또는 TPA 구조체를 포함하는 활용한 광활성 ...,1


In [12]:
num_warmup_steps = 2
n_epochs = 5
num_train_steps = n_epochs + 1
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_train_steps)

In [13]:
# 토크나이저 함수 객체 생성
def preprocess_function(examples):
    return tokenizer1(examples["text"], truncation=True, max_length=512, padding=True)
def preprocess_function1(examples):
    return examples['label']

In [14]:
tokenized_finetune_dataset = finetune_dataset.map(preprocess_function, batched=True)  
# DatasetDict의 'text'를 토크나이징, finetune_dataset에는 features: ['text', 'label']만 있었으나,
# tokenized_finetune_dataset에는 'input_ids', 'token_type_ids', 'attention_mask'가 추가됨

100%|██████████| 10/10 [00:01<00:00,  5.91ba/s]
100%|██████████| 6/6 [00:00<00:00,  6.99ba/s]


In [15]:
tokenized_finetune_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9881
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5203
    })
})

In [16]:
# tokenized_finetune_dataset['train']

In [17]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer1)

In [19]:
# training_args = TrainingArguments(output_dir="c09k_finetuned_bert")
training_args = TrainingArguments(
    output_dir="c09k_finetuned_bert_512",
    evaluation_strategy="epoch",    # 'steps': evaluate each `logging_steps`, 'epoch'  : each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_steps=500,             # evaluate, log and save model checkpoints every 1000 step
    save_steps=1000,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


### c09k 모델 evaluation

In [25]:
# ?pipeline
text_classifier = pipeline('text-classification', model=model1, tokenizer=tokenizer1)

In [26]:
tokenizer1.vocab_size

8000

In [27]:
trainer = Trainer(
    model=model1,
    args=training_args,
    train_dataset=tokenized_finetune_dataset['train'],
    eval_dataset=tokenized_finetune_dataset['test'],
#     compute_metrics=compute_metrics,
#     tokenizer=tokenizer1,
    data_collator=data_collator,
)

In [28]:
test_output = trainer.predict(test_dataset=tokenized_finetune_dataset['test'])

The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 5203
  Batch size = 8


In [29]:
y_pred = np.argmax(test_output.predictions, axis=1)

In [30]:
y_true = np.array(test_dataset['label'])
y_true

array([0, 0, 0, ..., 9, 9, 9])

In [33]:
test_label = finetune_dataset['test']['label']

In [34]:
test_output.label_ids == test_label

array([ True,  True,  True, ...,  True,  True,  True])

In [36]:
result_df = pd.DataFrame()
result_df['label'] = y_true
result_df['pred'] = y_pred
result_df['input'] = np.array(test_dataset['text'])

In [37]:
test_output1 = trainer1.predict(test_dataset=tokenized_finetune_dataset['train'])

The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 9881
  Batch size = 8


In [38]:
y_pred1 = np.argmax(test_output1.predictions, axis=1)
y_true1 = np.array(train_dataset['label'])
result_df1 = pd.DataFrame()
result_df1['label'] = y_true1
result_df1['pred'] = y_pred1
result_df1['input'] = np.array(train_dataset['text'])

In [57]:
result_total = pd.concat([result_df, result_df1])
result_total.reset_index(inplace=True)
result_total.pop('index')
result_total['target'] = result_total.apply(lambda x : 0 if x['pred'] != x['label'] else 1, axis=1)
result_total.pop('label')
result_total.pop('pred')
result_total.to_csv('data/c09k_sampling_data.csv', encoding='utf-8', index=False)

### split_train_test

In [58]:
from sklearn.model_selection import train_test_split

In [65]:
train_input, test_input = train_test_split(result_total, random_state=15, stratify=result_total['target'], shuffle=True, test_size=0.2)

In [66]:
print(train_input.shape, test_input.shape)  # (12067, 2) (3017, 2)

(12067, 2) (3017, 2)


In [86]:
train_input.to_csv('data/c09k_sampling_train_input.csv', sep='\t', encoding='utf-8', index=False)
test_input.to_csv('data/c09k_sampling_test_input.csv', sep='\t', encoding='utf-8', index=False)

### sampling model train set load

In [4]:
train_data_df1 = pd.read_csv('data/c09k_sampling_train_input.csv', sep='\t')  # text와 라벨 파일
test_dataset_df1 = pd.read_csv('data/c09k_sampling_test_input.csv', sep='\t')
train_data_df1.rename(columns={'input':'text', 'target':'label'}, inplace=True)
test_dataset_df1.rename(columns={'input':'text', 'target':'label'}, inplace=True)
train_dataset1 = Dataset.from_pandas(train_data_df1)  # Dataset 객체 생성
test_dataset1 = Dataset.from_pandas(test_dataset_df1)
finetune_dataset1 = DatasetDict()  # DatasetDict 객체 생성
finetune_dataset1['train'] = train_dataset1
finetune_dataset1['test'] = test_dataset1
finetune_dataset1

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 12067
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 3017
    })
})

### Training Sampling Model
* 9.6일의 Fine-tuning시 로드한 pre-trained model을 사용
     * model_path = "c09k_pretrained_bert_512_2", "checkpoint-9000"

In [7]:
# load the model checkpoint
model2 = BertForSequenceClassification.from_pretrained(
    os.path.join('c09k_pretrained_bert_512_2', 'checkpoint-9000'),
    return_dict=True, num_labels=2)
# load the tokenizer
tokenizer2 = BertTokenizerFast.from_pretrained(tokenizer_path, vocab_size=8000, local_files_only=True)
optimizer = AdamW(model2.parameters(), lr=1e-5)
# optimizer = A AdamW(model1.parameters(), lr=1e-5)

loading configuration file c09k_pretrained_bert_512_2/checkpoint-9000/config.json
Model config BertConfig {
  "_name_or_path": "c09k_pretrained_bert_512/checkpoint-15000",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.22.0.dev0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 8000
}

loading weights file c09k_pretrained_bert_512_2/checkpoint-9000/pytorch_model.bin
Some weights of the model checkpoint at c09k_pretrained_bert_512_2/checkpoint-9000 were not used when initializing BertForSequenceClassification: ['cls.pr

vocab_file vocab.txt
tokenizer_file tokenizer.json
added_tokens_file added_tokens.json
special_tokens_map_file special_tokens_map.json
tokenizer_config_file tokenizer_config.json


In [8]:
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model2.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model2.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5)

In [9]:
# 토크나이저 함수 객체 생성
def preprocess_function(examples):
    return tokenizer2(examples["text"], truncation=True, max_length=512, padding=True)
# def preprocess_function1(examples):
#     return examples['label']
tokenized_finetune_dataset2 = finetune_dataset1.map(preprocess_function, batched=True)  
data_collator2 = DataCollatorWithPadding(tokenizer=tokenizer2)

100%|██████████| 13/13 [00:02<00:00,  6.31ba/s]
100%|██████████| 4/4 [00:00<00:00,  8.14ba/s]


In [10]:
tokenized_finetune_dataset2

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 12067
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3017
    })
})

In [30]:
# training_args = TrainingArguments(output_dir="c09k_finetuned_bert")
training_args2 = TrainingArguments(
    output_dir=model_save_path,
    evaluation_strategy="steps",    # 'steps': evaluate each `logging_steps`, 'epoch'  : each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,  # accumulating the gradients before updating the weights
    per_device_eval_batch_size=8,
    num_train_epochs=100,
    logging_steps=500,             # evaluate, log and save model checkpoints every 1000 step
    save_steps=500,
    weight_decay=0.01,
)

using `logging_steps` to initialize `eval_steps` to 500
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [31]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [32]:
trainer2 = Trainer(
    model=model2,
    args=training_args2,
    train_dataset=tokenized_finetune_dataset2['train'],
    eval_dataset=tokenized_finetune_dataset2['test'],
#     compute_metrics=compute_metrics,
    tokenizer=tokenizer2,
    data_collator=data_collator2,
)

In [33]:
trainer2.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 12067
  Num Epochs = 100
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 37700


Step,Training Loss,Validation Loss
500,0.52,0.625467
1000,0.5775,0.616343
1500,0.5552,0.653431
2000,0.5297,0.791353
2500,0.5275,0.740984
3000,0.521,0.767063
3500,0.5061,0.893558
4000,0.5029,0.88615
4500,0.4982,0.941815
5000,0.4907,0.953361


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3017
  Batch size = 8
Saving model checkpoint to c09k_sampling_model/checkpoint-500
Configuration saved in c09k_sampling_model/checkpoint-500/config.json
Model weights saved in c09k_sampling_model/checkpoint-500/pytorch_model.bin
tokenizer config file saved in c09k_sampling_model/checkpoint-500/tokenizer_config.json
Special tokens file saved in c09k_sampling_model/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****

  Batch size = 8
Saving model checkpoint to c09k_sampling_model/checkpoint-6000
Configuration saved in c09k_sampling_model/checkpoint-6000/config.json
Model weights saved in c09k_sampling_model/checkpoint-6000/pytorch_model.bin
tokenizer config file saved in c09k_sampling_model/checkpoint-6000/tokenizer_config.json
Special tokens file saved in c09k_sampling_model/checkpoint-6000/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3017
  Batch size = 8
Saving model checkpoint to c09k_sampling_model/checkpoint-6500
Configuration saved in c09k_sampling_model/checkpoint-6500/config.json
Model weights saved in c09k_sampling_model/checkpoint-6500/pytorch_model.bin
tokenizer config file saved in c09k_sampling_model/chec

Special tokens file saved in c09k_sampling_model/checkpoint-11500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3017
  Batch size = 8
Saving model checkpoint to c09k_sampling_model/checkpoint-12000
Configuration saved in c09k_sampling_model/checkpoint-12000/config.json
Model weights saved in c09k_sampling_model/checkpoint-12000/pytorch_model.bin
tokenizer config file saved in c09k_sampling_model/checkpoint-12000/tokenizer_config.json
Special tokens file saved in c09k_sampling_model/checkpoint-12000/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertFor

RuntimeError: [enforce fail at inline_container.cc:319] . unexpected pos 619618496 vs 619618384

In [45]:
result_df2 = result_df.replace({'label': ind_label, 'pred': ind_label})
result_df2.to_csv('data/finetuned_predict_result_c09k.csv', encoding='utf-8', index=False)

In [70]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, f1_score, classification_report

In [69]:
?accuracy_score

In [72]:
print(classification_report(y_true=y_true, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        99
           1       0.00      0.00      0.00        72
           2       0.00      0.00      0.00        70
           3       0.05      0.01      0.01       691
           4       0.10      0.08      0.09       469
           5       0.10      0.09      0.09       435
           6       0.11      0.11      0.11       463
           7       0.00      0.00      0.00       589
           8       0.03      0.02      0.02       356
           9       0.14      0.01      0.03       568
          10       0.04      0.04      0.04       296
          11       0.07      0.14      0.10       202
          12       0.00      0.00      0.00       133
          13       0.00      0.00      0.00       196
          14       0.03      0.07      0.04       122
          15       0.05      0.19      0.08       219
          16       0.00      0.00      0.00       205
          17       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [68]:
print(accuracy_score(y_true=y_true, y_pred=y_pred, average=None),
recall_score(y_true=y_true, y_pred=y_pred, average=None),
f1_score(y_true=y_true, y_pred=y_pred, average=None))


TypeError: accuracy_score() got an unexpected keyword argument 'average'

In [49]:
confusion_matrix(test_label, pred
#                  , labels=['K0', 'K1', 'K21', 'K211', 'K212', 'K2121', 'K2122', 'K2123', 'K213', 'K2131', 'K2132', 'K2133', 'K214', 'K22', 'K23', 'K24', 'K241', 'K242']
                )

array([[  0,   0,   0,   8,  11,  12,   8,   0,   2,   5,   0,  10,   0,
          6,   0,   1,   1,  35],
       [  0,   0,   0,   0,   1,   5,  16,   0,   2,   0,   3,  10,   0,
          2,   0,   2,  14,  17],
       [  0,   0,   0,   0,   4,   2,   6,   0,  25,   2,   0,   0,   0,
          7,   9,   8,   0,   7],
       [ 12,   3,   4,   5,  37,  36,  62,   1,  21,  10,  42,  36,   0,
          5,  35, 117,   7, 258],
       [  2,   1,   4,   1,  38,  24,  41,   0,  18,   5,  18,  36,   0,
          0,  23,  38,   3, 217],
       [ 10,   1,   0,   5,  40,  37,  37,   0,  13,   5,  27,  33,   0,
          2,  17,  36,   0, 172],
       [  6,   3,   0,  14,  22,  51,  50,   0,  22,   1,  33,  27,   0,
          3,  19,  86,   0, 126],
       [  9,   2,   0,  12,  27,  47,  57,   0,  29,   4,  34,  35,   0,
          4,  27, 142,   0, 160],
       [  1,   0,   0,   4,  50,  24,  26,   0,   6,   4,  20,  35,   0,
          0,  14,  50,   0, 122],
       [ 20,   5,   4,   9,  33,  22,

In [84]:
def eval_c09k(path, chkpt):
    model = BertForSequenceClassification.from_pretrained(os.path.join(path, chkpt), return_dict=True, num_labels=18)
    tokenizer = BertTokenizerFast.from_pretrained('c09k_pretrained_bert', vocab_size=8000, local_files_only=True)
    optimizer = AdamW(model1.parameters(), lr=1e-5)
    text_classifier = pipeline('text-classification', model=model, tokenizer=tokenizer)
    # training_args = TrainingArguments(output_dir="c09k_finetuned_bert")
    training_args = TrainingArguments(
        output_dir="c09k_finetuned_bert_512",
        evaluation_strategy="epoch",    # 'steps': evaluate each `logging_steps`, 'epoch'  : each epoch
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=5,
        weight_decay=0.01,
        logging_steps=500,             # evaluate, log and save model checkpoints every 1000 step
        save_steps=1000,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        eval_dataset=tokenized_finetune_dataset['test'],
#         compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )
    test_output = trainer.predict(test_dataset=tokenized_finetune_dataset['test'])
    y_pred = np.argmax(test_output.predictions, axis=1)
    y_true = np.array(test_dataset['label'])
    return [classification_report(y_true, y_pred), confusion_matrix(y_true, y_pred), y_pred]

In [85]:
chks = [['c09k_finetuned_bert_512', 'checkpoint-3000'],
['c09k_finetuned_bert_512', 'checkpoint-6000'],
['c09k_finetuned_bert_512', 'checkpoint-9000'],
['c09k_finetuned_bert_512', 'checkpoint-12000'],
['c09k_finetuned_bert_512', 'checkpoint-15000'],
['c09k_finetuned_bert_512', 'checkpoint-4000'],
['c09k_finetuned_bert_512', 'checkpoint-5000'],
['c09k_finetuned_bert_512', 'checkpoint-7000']]
result = []
for chk in chks:
    try:
        report, conf_mat, y_pred = eval_c09k(chk[0], chk[1])
        result.append([chk, report, conf_mat, y_pred])
    except:
        pass
#     finally:


loading configuration file c09k_finetuned_bert_512/checkpoint-3000/config.json
Model config BertConfig {
  "_name_or_path": "c09k_pretrained_bert_512_2/checkpoint-9000",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_11": 11,
    "LABEL_12": 12,
    "LABEL_13": 13,
    "LABEL_14": 14,
    "LABEL_15": 1

vocab_file vocab.txt
tokenizer_file tokenizer.json
added_tokens_file added_tokens.json
special_tokens_map_file special_tokens_map.json
tokenizer_config_file tokenizer_config.json


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
loading configuration file c09k_finetuned_bert_512/checkpoint-6000/config.json
Model config BertConfig {
  "_name_or_path": "c09k_pretrained_bert_512_2/checkpoint-9000",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "l

vocab_file vocab.txt
tokenizer_file tokenizer.json
added_tokens_file added_tokens.json
special_tokens_map_file special_tokens_map.json
tokenizer_config_file tokenizer_config.json


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
loading configuration file c09k_finetuned_bert_512/checkpoint-4000/config.json
Model config BertConfig {
  "_name_or_path": "c09k_pretrained_bert_512_2/checkpoint-9000",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "l

vocab_file vocab.txt
tokenizer_file tokenizer.json
added_tokens_file added_tokens.json
special_tokens_map_file special_tokens_map.json
tokenizer_config_file tokenizer_config.json


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
loading configuration file c09k_finetuned_bert_512/checkpoint-5000/config.json
Model config BertConfig {
  "_name_or_path": "c09k_pretrained_bert_512_2/checkpoint-9000",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "l

vocab_file vocab.txt
tokenizer_file tokenizer.json
added_tokens_file added_tokens.json
special_tokens_map_file special_tokens_map.json
tokenizer_config_file tokenizer_config.json


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [86]:
for chk, report, confusion_mat, y_pred in result:
    print(chk, report, '\n', confusion_mat, '\n', '--------------------------------------------------------')

['c09k_finetuned_bert_512', 'checkpoint-3000']               precision    recall  f1-score   support

           0       0.00      0.00      0.00        99
           1       0.62      0.92      0.74        72
           2       0.89      0.70      0.78        70
           3       0.18      0.05      0.08       691
           4       0.25      0.41      0.32       469
           5       0.20      0.17      0.19       435
           6       0.15      0.27      0.19       463
           7       0.19      0.17      0.18       589
           8       0.17      0.06      0.09       356
           9       0.38      0.55      0.45       568
          10       0.26      0.24      0.25       296
          11       0.17      0.24      0.20       202
          12       0.00      0.00      0.00       133
          13       0.04      0.03      0.03       196
          14       0.21      0.25      0.22       122
          15       0.18      0.03      0.05       219
          16       0.29      0.41 

In [89]:
# best: ['c09k_finetuned_bert_512', 'checkpoint-5000']
y_pred = result[3][3]


array([10, 10,  5, ...,  7,  7,  6])

In [86]:
# load the model checkpoint
model2 = BertForSequenceClassification.from_pretrained(
    os.path.join('c09k_finetuned_bert', "checkpoint-3000"),
    return_dict=True, num_labels=18)
# load the tokenizer
tokenizer2 = BertTokenizerFast.from_pretrained(model_path, vocab_size=8000, local_files_only=True)

loading configuration file c09k_finetuned_bert/checkpoint-3000/config.json
Model config BertConfig {
  "_name_or_path": "c09k_pretrained_bert/checkpoint-7500",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_11": 11,
    "LABEL_12": 12,
    "LABEL_13": 13,
    "LABEL_14": 14,
    "LABEL_15": 15,
    "LA

vocab_file vocab.txt
tokenizer_file tokenizer.json
added_tokens_file added_tokens.json
special_tokens_map_file special_tokens_map.json
tokenizer_config_file tokenizer_config.json


In [88]:
# training_args = TrainingArguments(output_dir="c09k_finetuned_bert")
training_args = TrainingArguments(
    output_dir="c09k_finetuned_bert2",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [89]:
trainer2 = Trainer(
    model=model2,
    args=training_args,
    train_dataset=tokenized_finetune_dataset['train'],
    eval_dataset=tokenized_finetune_dataset['test'],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer2,
    data_collator=data_collator,
)

In [95]:
# load the model checkpoint
model3 = BertForSequenceClassification.from_pretrained(
    os.path.join('c09k_finetuned_bert2', "checkpoint-6000"),
    return_dict=True, num_labels=18)
# load the tokenizer
tokenizer3 = BertTokenizerFast.from_pretrained(model_path, vocab_size=8000, local_files_only=True)
training_args = TrainingArguments(
    output_dir="c09k_finetuned_bert2",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01, eval_steps=1000, resume_from_checkpoint=os.path.join('c09k_finetuned_bert', "checkpoint-3000")
)
trainer3 = Trainer(
    model=model3,
    args=training_args,
    train_dataset=tokenized_finetune_dataset['train'],
    eval_dataset=tokenized_finetune_dataset['test'],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer3,
    data_collator=data_collator
)
trainer3.train()

loading configuration file c09k_finetuned_bert/checkpoint-3000/config.json
Model config BertConfig {
  "_name_or_path": "c09k_pretrained_bert/checkpoint-7500",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_11": 11,
    "LABEL_12": 12,
    "LABEL_13": 13,
    "LABEL_14": 14,
    "LABEL_15": 15,
    "LA

vocab_file vocab.txt
tokenizer_file tokenizer.json
added_tokens_file added_tokens.json
special_tokens_map_file special_tokens_map.json
tokenizer_config_file tokenizer_config.json


Step,Training Loss
500,1.4499
1000,1.359
1500,1.3058
2000,1.2929
2500,1.2847
3000,1.2649
3500,1.2378
4000,1.2165
4500,1.1905
5000,1.175


Saving model checkpoint to c09k_finetuned_bert2/checkpoint-500
Configuration saved in c09k_finetuned_bert2/checkpoint-500/config.json
Model weights saved in c09k_finetuned_bert2/checkpoint-500/pytorch_model.bin
tokenizer config file saved in c09k_finetuned_bert2/checkpoint-500/tokenizer_config.json
Special tokens file saved in c09k_finetuned_bert2/checkpoint-500/special_tokens_map.json
Saving model checkpoint to c09k_finetuned_bert2/checkpoint-1000
Configuration saved in c09k_finetuned_bert2/checkpoint-1000/config.json
Model weights saved in c09k_finetuned_bert2/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in c09k_finetuned_bert2/checkpoint-1000/tokenizer_config.json
Special tokens file saved in c09k_finetuned_bert2/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to c09k_finetuned_bert2/checkpoint-1500
Configuration saved in c09k_finetuned_bert2/checkpoint-1500/config.json
Model weights saved in c09k_finetuned_bert2/checkpoint-1500/pytorch_model.bin
tok

TrainOutput(global_step=6180, training_loss=1.2481813918425426, metrics={'train_runtime': 816.681, 'train_samples_per_second': 120.99, 'train_steps_per_second': 7.567, 'total_flos': 3250217273679360.0, 'train_loss': 1.2481813918425426, 'epoch': 10.0})

In [60]:
len(pred)

5203