In [1]:
import os
import torch
import numpy as np
from sklearn.metrics import accuracy_score
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from transformers import AutoConfig

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


#### dataset 다운로드

In [3]:
nsmc_dataset = load_dataset('nsmc', trust_remote_code=True)
print(nsmc_dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 150000
    })
    test: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 50000
    })
})


#### Tokenizer & Model

In [4]:
tokenizer = AutoTokenizer.from_pretrained('klue/bert-base')
model = AutoModelForSequenceClassification.from_pretrained('klue/bert-base')
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

Dynamic padding이 적용되려면 tokenizer의 padding 인수값이 True로 설정되어야 한다. 그런데 이 부분이 'max_length'로 설정되어 있어 적용이 제대로 되지 않았다

In [5]:
def transform(data):
    return tokenizer(
        data['document'],
        truncation=True,
        padding='max_length',
        return_token_type_ids=False,
        )

In [6]:
train_dataset = nsmc_dataset['train'].map(transform, batched=True)
test_dataset = nsmc_dataset['test'].map(transform, batched=True)

#### train & evaluate

In [7]:
def accuracy(eval_pred):    
    preds, labels = eval_pred
    preds = np.argmax(preds, axis=1)
    accuracy = (preds == labels).mean()
    return {"accuracy": accuracy}  # dict 형태로 반환

In [None]:
# output_dir = os.getenv('HOME')+'/aiffel/huggingface'
output_dir = os.getenv('HOME')+'/Desktop/Quest08'

training_arguments = TrainingArguments(
    output_dir, 
    evaluation_strategy="epoch",  # evaluation하는 주기   
    save_strategy="epoch",  # 체크포인트 저장 주기를 스텝 단위로 설정
    load_best_model_at_end=True,  # 가장 좋은 모델을 마지막에 불러오기
    metric_for_best_model="accuracy",  # 'accuracy'가 가장 큰 모델을 선택
    greater_is_better=True,  # accuracy가 가장 클수록 좋음
    per_device_train_batch_size = 16, 
    per_device_eval_batch_size = 16, 
    num_train_epochs = 3, 
    group_by_length=False, # dynamic padding 적용
)



In [9]:
# # freeze parameters of BERT 
# for param in model.bert.parameters():
#     param.requires_grad = False

In [10]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# classifier만 fine-tuning하도록 설정하려면 model.classifier.parameters()로 수정
optimizer = torch.optim.Adam(model.parameters(), 
                             lr=1e-4, 
                             weight_decay=0.01) 

trainer = Trainer(
    model=model, 
    args=training_arguments, 
    train_dataset=train_dataset, 
    eval_dataset=test_dataset, 
    optimizers=(optimizer, None),
    compute_metrics=accuracy,
    data_collator=data_collator
)
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.4658,0.437237,0.80704
2,0.4839,0.474406,0.8084
3,0.4764,0.466758,0.82614


TrainOutput(global_step=28125, training_loss=0.4734176708984375, metrics={'train_runtime': 10185.9436, 'train_samples_per_second': 44.179, 'train_steps_per_second': 2.761, 'total_flos': 1.18399974912e+17, 'train_loss': 0.4734176708984375, 'epoch': 3.0})

In [11]:
trainer.evaluate(test_dataset)

{'eval_loss': 0.46675801277160645,
 'eval_accuracy': 0.82614,
 'eval_runtime': 306.683,
 'eval_samples_per_second': 163.035,
 'eval_steps_per_second': 10.19,
 'epoch': 3.0}

#### dynamic padding 유무에 따른 실험 결과
 
- with dynamic padding  
![Image](https://github.com/user-attachments/assets/70e6d2b9-b38c-4883-b6d6-b35317eaacb9)  
- without dynamic padding  
![Image](https://github.com/user-attachments/assets/a9ed9bfc-8d8f-429d-8e4f-81b62ed60a10)  

##### 회고

- Dynamic padding을 적용했을 때가 학습/추론 속도가 6.5배 정도 더 빨랐다. 하이퍼파라미터에 따라 달라질 수 있겠지만, 성능 면에서도 오히려 좋았다.  
- 두 케이스 모두 90% 이상의 성능을 달성하진 못했다. 하지만 허깅페이스 프레임워크에 대해 학습할 수 있었고, 요즘 RAG 공부를 하고 있는데 문서 DB를 임베딩하고 query와의 유사도 측정에 허깅페이스의 pre-trained model이 유용하게 활용될 수 있겠다는 생각이 들었다.  