# 시드 통일하기

In [1]:
import torch
from transformers import set_seed


# Seed the random number generators through transformers' helper so runs are repeatable.
set_seed(42)

# Use CUDA when PyTorch can see a GPU; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Bare expression: the notebook echoes the selected device as the cell output.
device

device(type='cuda')

# 모델 불러오기

In [2]:
# Keyword arguments that are unpacked into `from_pretrained` when the classifier
# is created: three labels, with each label id mapped to itself in both directions.
model_config = dict(
    num_labels=3,
    id2label={0: 0, 1: 1, 2: 2},
    label2id={0: 0, 1: 1, 2: 2},
)

In [3]:
# from transformers import KoBertForSequenceClassification
# from tokenization_kobert import KoBertTokenizer


# model = KoBertForSequenceClassification.from_pretrained("monologg/kobert-discriminator", **model_config)
# tokenizer = KoBertTokenizer.from_pretrained("monologg/kobert-discriminator", model_max_length=512)

from transformers import ElectraForSequenceClassification, ElectraTokenizer


# The classifier and its tokenizer are both loaded from the same Hub checkpoint.
PRETRAINED_NAME = "monologg/koelectra-base-v3-discriminator"

# `model_config` supplies num_labels / id2label / label2id for the classification head.
model = ElectraForSequenceClassification.from_pretrained(PRETRAINED_NAME, **model_config)
# Record 512 as the tokenizer's maximum sequence length.
tokenizer = ElectraTokenizer.from_pretrained(PRETRAINED_NAME, model_max_length=512)

Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: 

# 하이퍼 파라미터 설정

In [4]:
# Number of label classes (same value as num_labels in model_config).
num_classes = 3

# Tunable knobs; edit freely.
# NOTE(review): the TrainingArguments cell hard-codes its own batch size (16),
# learning rate (1e-4) and epoch count (150), so batch_size and lr below are
# not what the Trainer run actually uses. Confirm which values are intended.
epochs = 150
lr = 0.00005  # 5e-5
batch_size = 64

In [5]:
# 옵티마이저와 스케줄러
# 원하는 대로 고쳐서 사용

# from torch.optim import AdamW
# from transformers import AdamW, get_linear_schedule_with_warmup


# LENGTH_OF_TRAIN_DATA = 45154  # matches "Num examples" in the training log; update if the dataset changes
# num_training_steps = ((LENGTH_OF_TRAIN_DATA - 1) // batch_size + 1) * epochs
# optimizer = AdamW(model.parameters(), lr=lr)
# scheduler = get_linear_schedule_with_warmup(optimizer, int(num_training_steps * 0.1), num_training_steps)

# 만약 이 스케줄러를 사용할 경우,
# optimizer.step() 바로 다음에
# scheduler.step()을 호출해야 함

# f1 score 계산하기

In [6]:
# 1. torchmetrics 사용
# requirements에 넣어놨으니 따로 설치할 필요 없음

# from torchmetrics import F1


# f1_score = F1(num_classes=num_classes)

# # 검증 단계에서 사용
# for inputs, labels in val_loader:
#     # 대충 코드
#     output = model(inputs)  # 대충 아웃풋
#     pred = torch.argmax(output, dim=1)
#     batch_f1 = f1_score(pred, labels)
#     print(batch_f1)

# f1 = f1_score.compute()
# print(f1)

In [7]:
# 2. datasets 사용
# requirements에 넣어놨으니 따로 설치할 필요 없음

# from datasets import load_metric


# f1_score = load_metric("f1")

# for inputs, labels in val_loader:
#     # 대충 코드
#     output = model(inputs)
#     pred = torch.argmax(output, dim=1)
#     f1_score.add_batch(predictions=pred, references=labels)

# f1 = f1_score.compute()
# print(f1)

# 훈련

In [8]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score


def compute_metrics(p):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        p: A two-item iterable ``(scores, labels)``. The predicted class for
            each row is the argmax of ``scores`` along axis 1.

    Returns:
        dict: ``{"accuracy": ..., "f1": ...}`` where ``f1`` uses macro averaging.
    """
    scores, references = p
    predicted = np.argmax(scores, axis=1)

    return {
        "accuracy": accuracy_score(y_true=references, y_pred=predicted),
        "f1": f1_score(y_true=references, y_pred=predicted, average="macro"),
    }

In [9]:
from data_loader import get_data_loaders


# Build the train / validation / test splits with the project's data_loader helper,
# tokenizing with the tokenizer loaded above.
# return_loader=False presumably yields dataset objects rather than DataLoaders,
# which is what the Trainer cell consumes; confirm in data_loader.py.
# NOTE(review): test_data is not used by any later cell shown in this notebook.
train_data, val_data, test_data = get_data_loaders(tokenizer, return_loader=False)

Using custom data configuration default-eafb513e725dbeca
Reusing dataset csv (C:\Users\cglab\.cache\huggingface\datasets\csv\default-eafb513e725dbeca\0.0.0\9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff)
Loading cached processed dataset at C:\Users\cglab\.cache\huggingface\datasets\csv\default-eafb513e725dbeca\0.0.0\9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff\cache-d815bbee38dcbc26.arrow
Loading cached processed dataset at C:\Users\cglab\.cache\huggingface\datasets\csv\default-eafb513e725dbeca\0.0.0\9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff\cache-5c9fdd80709ca508.arrow
Loading cached processed dataset at C:\Users\cglab\.cache\huggingface\datasets\csv\default-eafb513e725dbeca\0.0.0\9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff\cache-60328c4920383600.arrow


In [10]:
# Name of this run; it doubles as the directory for checkpoints and the final model.
model_name = "KoELECTRA_base_v3_datav2"
# The output directory is exactly the run name, so no f-string wrapper is needed.
save_path = model_name

In [11]:
from transformers import TrainingArguments, EarlyStoppingCallback

from trainer import ImbalancedSamplerTrainer


# Training configuration for the Hugging Face Trainer.
args = TrainingArguments(
    output_dir=save_path,
    overwrite_output_dir=True,
    # Evaluate, log and checkpoint on the same 1000-step cadence. eval_steps is
    # set explicitly instead of being inherited from logging_steps, so changing
    # the logging frequency cannot silently change how often the model is
    # evaluated (and therefore what early_stopping_patience means), and
    # save_steps stays aligned with the evaluation steps for best-model loading.
    evaluation_strategy="steps",
    eval_steps=1000,
    logging_steps=1000,
    save_steps=1000,
    num_train_epochs=150,
    seed=42,
    # Track the "f1" value returned by compute_metrics; higher is better, and
    # the best-scoring checkpoint is reloaded when training ends.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # NOTE(review): the logged run planned 423450 optimization steps, so a 10%
    # warmup is roughly 42k steps, yet training stopped at step 34000. That is
    # still inside the warmup phase. Confirm this schedule is intended.
    warmup_ratio=0.1,
    adafactor=True,
    lr_scheduler_type="cosine_with_restarts",
    learning_rate=1e-4,
)

# ImbalancedSamplerTrainer comes from the project's trainer module (presumably a
# Trainer variant that samples to offset class imbalance; confirm in trainer.py).
trainer = ImbalancedSamplerTrainer(
    model=model,
    args=args,
    train_dataset=train_data,
    eval_dataset=val_data,
    compute_metrics=compute_metrics,
    # Stop once 20 consecutive evaluations bring no improvement in the tracked metric.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=20)],
)

In [12]:
# Launch fine-tuning with the trainer configured in the previous cell.
# The cell output is the returned TrainOutput (global step, training loss, metrics).
trainer.train()

***** Running training *****
  Num examples = 45154
  Num Epochs = 150
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 423450


Step,Training Loss,Validation Loss,Accuracy,F1
1000,1.0431,0.813652,0.762654,0.443088
2000,0.7004,0.365383,0.874053,0.637602
3000,0.4271,0.36021,0.868274,0.676597
4000,0.315,0.295772,0.901554,0.718292
5000,0.2318,0.265334,0.919291,0.749149
6000,0.1997,0.350936,0.912714,0.745613
7000,0.1866,0.302968,0.918693,0.745362
8000,0.1636,0.405137,0.91112,0.757225
9000,0.143,0.449312,0.910323,0.743393
10000,0.1305,0.373581,0.928059,0.758698


***** Running Evaluation *****
  Num examples = 5018
  Batch size = 16
Saving model checkpoint to KoELECTRA_base_v3_datav2\checkpoint-1000
Configuration saved in KoELECTRA_base_v3_datav2\checkpoint-1000\config.json
Model weights saved in KoELECTRA_base_v3_datav2\checkpoint-1000\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 5018
  Batch size = 16
Saving model checkpoint to KoELECTRA_base_v3_datav2\checkpoint-2000
Configuration saved in KoELECTRA_base_v3_datav2\checkpoint-2000\config.json
Model weights saved in KoELECTRA_base_v3_datav2\checkpoint-2000\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 5018
  Batch size = 16
Saving model checkpoint to KoELECTRA_base_v3_datav2\checkpoint-3000
Configuration saved in KoELECTRA_base_v3_datav2\checkpoint-3000\config.json
Model weights saved in KoELECTRA_base_v3_datav2\checkpoint-3000\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 5018
  Batch size = 16
Saving model checkpoint to KoELECTRA_b

Configuration saved in KoELECTRA_base_v3_datav2\checkpoint-28000\config.json
Model weights saved in KoELECTRA_base_v3_datav2\checkpoint-28000\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 5018
  Batch size = 16
Saving model checkpoint to KoELECTRA_base_v3_datav2\checkpoint-29000
Configuration saved in KoELECTRA_base_v3_datav2\checkpoint-29000\config.json
Model weights saved in KoELECTRA_base_v3_datav2\checkpoint-29000\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 5018
  Batch size = 16
Saving model checkpoint to KoELECTRA_base_v3_datav2\checkpoint-30000
Configuration saved in KoELECTRA_base_v3_datav2\checkpoint-30000\config.json
Model weights saved in KoELECTRA_base_v3_datav2\checkpoint-30000\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 5018
  Batch size = 16
Saving model checkpoint to KoELECTRA_base_v3_datav2\checkpoint-31000
Configuration saved in KoELECTRA_base_v3_datav2\checkpoint-31000\config.json
Model weights saved in 

TrainOutput(global_step=34000, training_loss=0.1651127061282887, metrics={'train_runtime': 22777.24, 'train_samples_per_second': 297.363, 'train_steps_per_second': 18.591, 'total_flos': 1.430894961898537e+17, 'train_loss': 0.1651127061282887, 'epoch': 12.04})

In [13]:
# The tokenizer was never handed to the trainer, so save_model does not write it.
# Store it next to the weights so the run directory can be reloaded on its own.
tokenizer.save_pretrained(save_path)
# Persist the model currently held by the trainer into the run directory.
trainer.save_model(output_dir=save_path)

Saving model checkpoint to KoELECTRA_base_v3_datav2
Configuration saved in KoELECTRA_base_v3_datav2\config.json
Model weights saved in KoELECTRA_base_v3_datav2\pytorch_model.bin


In [14]:
# Write the trainer's state to disk. Called without arguments, so the trainer picks
# the location (presumably trainer_state.json under output_dir; confirm in the Trainer docs).
trainer.save_state()