In [1]:
import tensorflow
import numpy
import transformers
import datasets

print(tensorflow.__version__)
print(numpy.__version__)
print(transformers.__version__)
print(datasets.__version__)

2.6.0
1.21.4
4.11.3
1.14.0


## 1. NSMC 데이터 분석 및 Huggingface dataset 구성

In [13]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split

# 데이터셋 불러오기
dataset = load_dataset('nsmc')

Using custom data configuration default
Reusing dataset nsmc (/aiffel/.cache/huggingface/datasets/nsmc/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3)


  0%|          | 0/2 [00:00<?, ?it/s]

In [20]:
spilt_dataset = dataset['train'].train_test_split(test_size=0.2)
spilt_dataset

Loading cached split indices for dataset at /aiffel/.cache/huggingface/datasets/nsmc/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3/cache-1a6f83a5d3943579.arrow and /aiffel/.cache/huggingface/datasets/nsmc/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3/cache-6007649e922b3575.arrow


DatasetDict({
    train: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 30000
    })
})

In [21]:
train_dataset, validation_dataset = spilt_dataset['train'], spilt_dataset['test']
train_dataset, validation_dataset

(Dataset({
     features: ['id', 'document', 'label'],
     num_rows: 120000
 }),
 Dataset({
     features: ['id', 'document', 'label'],
     num_rows: 30000
 }))

In [25]:
train_dataset = train_dataset.shuffle(seed=42).select(range(50000))
validation_dataset = validation_dataset.shuffle(seed=42).select(range(10000))
test_dataset = dataset['test'].shuffle(seed=42).select(range(5000))

train_dataset, validation_dataset, test_dataset

Loading cached shuffled indices for dataset at /aiffel/.cache/huggingface/datasets/nsmc/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3/cache-a06d82eadff10d75.arrow
Loading cached shuffled indices for dataset at /aiffel/.cache/huggingface/datasets/nsmc/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3/cache-3530610e74f8043f.arrow
Loading cached shuffled indices for dataset at /aiffel/.cache/huggingface/datasets/nsmc/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3/cache-1e99ce74af12773e.arrow


(Dataset({
     features: ['id', 'document', 'label'],
     num_rows: 50000
 }),
 Dataset({
     features: ['id', 'document', 'label'],
     num_rows: 10000
 }),
 Dataset({
     features: ['id', 'document', 'label'],
     num_rows: 5000
 }))

## STEP 2. klue/bert-base model 및 tokenizer 불러오기

In [47]:
from transformers import BertTokenizer, BertForSequenceClassification

model_name = 'klue/bert-base'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)

loading file https://huggingface.co/klue/bert-base/resolve/main/vocab.txt from cache at /aiffel/.cache/huggingface/transformers/1a36e69d48a008e522b75e43693002ffc8b6e6df72de7c53412c23466ec165eb.085110015ec67fc02ad067f712a7c83aafefaf31586a3361dd800bcac635b456
loading file https://huggingface.co/klue/bert-base/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/klue/bert-base/resolve/main/special_tokens_map.json from cache at /aiffel/.cache/huggingface/transformers/aeaaa3afd086a040be912f92ffe7b5f85008b744624f4517c4216bcc32b51cf0.054ece8d16bd524c8a00f0e8a976c00d5de22a755ffb79e353ee2954d9289e26
loading file https://huggingface.co/klue/bert-base/resolve/main/tokenizer_config.json from cache at /aiffel/.cache/huggingface/transformers/f8f71eb411bb03f57b455cfb1b4e04ae124201312e67a3ad66e0a92d0c228325.78871951edcb66032caa0a9628d77b3557c23616c653dacdb7a1a8f33011a843
loading file https://huggingface.co/klue/bert-base/resolve/main/tokenizer.json from cache at /aiffe

## STEP 3. 위에서 불러온 tokenizer으로 데이터셋을 전처리하고, model 학습 진행해 보기

In [40]:
from datasets import load_metric
metric = load_metric('glue', 'mrpc')

def compute_metrics(eval_pred):    
    predictions,labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references = labels)

In [41]:
import numpy as np
from datasets import load_metric
from transformers import TrainingArguments, Trainer

def preprocess_function(examples):
    return tokenizer(examples["document"], truncation=True)

train_dataset = train_dataset.map(preprocess_function, batched=True)
validation_dataset = validation_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)

metric = load_metric("accuracy")


training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch", 
    save_strategy="epoch",  # 에폭 단위로 저장
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    report_to='none'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

  0%|          | 0/50 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

PyTorch: setting up devices
The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: document, id.
***** Running training *****
  Num examples = 50000
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 4689


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2736,0.263816,0.8885
2,0.1869,0.2764,0.8915
3,0.1216,0.34525,0.8926


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: document, id.
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 32
Saving model checkpoint to ./results/checkpoint-1563
Configuration saved in ./results/checkpoint-1563/config.json
Model weights saved in ./results/checkpoint-1563/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1563/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1563/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: document, id.
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 32
Saving model checkpoint to ./results/checkpoint-3126
Configuration saved in ./results/checkpoint-3126/config.json
Model weights saved in ./results/checkpoint-3126/pytorch_model.bin
tokenizer conf

TrainOutput(global_step=4689, training_loss=0.2077511905174719, metrics={'train_runtime': 2077.5307, 'train_samples_per_second': 72.201, 'train_steps_per_second': 2.257, 'total_flos': 5629828362640320.0, 'train_loss': 0.2077511905174719, 'epoch': 3.0})

In [42]:
trainer.evaluate(test_dataset)

The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: document, id.
***** Running Evaluation *****
  Num examples = 5000
  Batch size = 32


{'eval_loss': 0.2662367820739746,
 'eval_accuracy': 0.8876,
 'eval_runtime': 22.6217,
 'eval_samples_per_second': 221.026,
 'eval_steps_per_second': 6.94,
 'epoch': 3.0}

## STEP 4. Fine-tuning을 통하여 모델 성능(accuarcy) 향상시키기
모델의 성능을 향상시키기 위해 다음과 같은 방법을 사용

- 데이터 전처리 개선: 데이터 클리닝, 정규화 등
- 하이퍼파라미터 튜닝: 학습률, 배치크기, 에폭 수 등 조정
- 더 큰 사전 학습 모델 사용: BERT-large, RoBERTa 등

In [43]:
# 데이터 전처리 개선
import re

def preprocess_function(examples):
    texts = examples["document"]
    texts = [re.sub(r'[^ ㄱ-ㅣ가-힣 . , ? ! ]', '', text) for text in texts] # 한글 이외 문자 제거
    texts = [text.strip() for text in texts] # 앞뒤 공백 제거
    
    return tokenizer(texts, truncation=True, padding="max_length", max_length=128)

train_dataset = train_dataset.map(preprocess_function, batched=True)
validation_dataset = validation_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)

# 하이퍼파라미터 튜닝 
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.01,
    warmup_steps=500,
    load_best_model_at_end=True,  # 최상의 모델만 저장
    report_to='none'  # 로깅 비활성화
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

  0%|          | 0/50 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

PyTorch: setting up devices
The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: document, id.
***** Running training *****
  Num examples = 50000
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 7815


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2275,0.30364,0.8734
2,0.157,0.325778,0.8831
3,0.0989,0.480885,0.8787
4,0.0642,0.558432,0.8869
5,0.0405,0.650467,0.885


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: document, id.
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 32
Saving model checkpoint to ./results/checkpoint-1563
Configuration saved in ./results/checkpoint-1563/config.json
Model weights saved in ./results/checkpoint-1563/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1563/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1563/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: document, id.
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 32
Saving model checkpoint to ./results/checkpoint-3126
Configuration saved in ./results/checkpoint-3126/config.json
Model weights saved in ./results/checkpoint-3126/pytorch_model.bin
tokenizer conf

TrainOutput(global_step=7815, training_loss=0.1194945931510901, metrics={'train_runtime': 5583.758, 'train_samples_per_second': 44.773, 'train_steps_per_second': 1.4, 'total_flos': 1.644444096e+16, 'train_loss': 0.1194945931510901, 'epoch': 5.0})

In [44]:
trainer.evaluate(test_dataset)

The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: document, id.
***** Running Evaluation *****
  Num examples = 5000
  Batch size = 32


{'eval_loss': 0.2956825792789459,
 'eval_accuracy': 0.8788,
 'eval_runtime': 37.3128,
 'eval_samples_per_second': 134.002,
 'eval_steps_per_second': 4.208,
 'epoch': 5.0}

## STEP 5. Bucketing을 적용하여 학습시키고, STEP 4의 결과와의 비교

In [None]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.01,
    warmup_steps=500,
    group_by_length=True, # Bucketing 적용
    load_best_model_at_end=True,  # 최상의 모델만 저장
    report_to='none'  # 로깅 비활성화
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

PyTorch: setting up devices
The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: document, id.
***** Running training *****
  Num examples = 50000
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 7815


Epoch,Training Loss,Validation Loss


In [36]:
trainer.evaluate(test_dataset)

The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: document, id.
***** Running Evaluation *****
  Num examples = 5000
  Batch size = 32


{'eval_loss': 0.34401822090148926,
 'eval_accuracy': 0.8834,
 'eval_runtime': 37.3282,
 'eval_samples_per_second': 133.947,
 'eval_steps_per_second': 4.206,
 'epoch': 5.0}