##  Fine-tune a pretrained model :  
## NSMC 감성분석 데이터 학습 및 분류기 만들기  

In [1]:
!pip install transformers
!pip install datasets

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.23.0-py3-none-a

In [2]:
# Load the NSMC dataset from the Hugging Face Hub
from datasets import load_dataset
nsmc = load_dataset('nsmc')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/11.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.71M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/150000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [3]:
# Inspect the loaded dataset (splits, features, row counts)
nsmc

DatasetDict({
    train: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 150000
    })
    test: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 50000
    })
})

In [4]:
# Inspect the train split on its own
nsmc['train']

Dataset({
    features: ['id', 'document', 'label'],
    num_rows: 150000
})

In [5]:
# Build the train / test subsets (only 2,000 rows each, to keep training time short)
SAMPLE_SEED = 42     # fixed shuffle seed so the same rows are drawn on every run
SAMPLE_SIZE = 2000   # rows kept from each split
train_data = nsmc['train'].shuffle(seed=SAMPLE_SEED).select(range(SAMPLE_SIZE))
test_data = nsmc['test'].shuffle(seed=SAMPLE_SEED).select(range(SAMPLE_SIZE))

In [6]:
# Check the sampled train and test subsets
print(train_data)
print(test_data)

Dataset({
    features: ['id', 'document', 'label'],
    num_rows: 2000
})
Dataset({
    features: ['id', 'document', 'label'],
    num_rows: 2000
})


In [7]:
# Load the 'bert-base-multilingual-cased' checkpoint for sentiment analysis:
# a sequence-classification model with 2 labels, plus the matching tokenizer
from transformers import AutoModelForSequenceClassification, AutoTokenizer

MODEL_NAME = 'bert-base-multilingual-cased'
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)



config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [8]:
# Try tokenizer.tokenize on the first training document.
# The document is bound to a name instead of being left as a bare expression,
# whose value was computed and then discarded (only the last expression is displayed).
sample_text = train_data['document'][0]
tokenizer.tokenize(sample_text)

['For',
 'Carl',
 '.',
 '칼',
 '세',
 '##이',
 '##건',
 '##으로',
 '시',
 '##작',
 '##해서',
 '칼',
 '세',
 '##이',
 '##건',
 '##으로',
 '끝',
 '##난',
 '##다',
 '.']

In [9]:
# Try the full tokenizer call on one document. The returned fields are:
#   input_ids      : vocabulary index of each token
#   token_type_ids : an input BERT carries because of its architecture (segment / document-level order)
#   attention_mask : marks padding positions
tokenizer(train_data['document'][0])

{'input_ids': [101, 11399, 12225, 119, 9788, 9435, 10739, 71439, 11467, 9485, 38709, 70146, 9788, 9435, 10739, 71439, 11467, 8977, 33305, 11903, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [10]:
# Tokenize (encode) both subsets with identical settings
encode_kwargs = dict(
    return_tensors='pt',  # return PyTorch tensors
    padding=True,
    truncation=True,
)
# train_encoding
train_encoding = tokenizer(train_data['document'], **encode_kwargs)
# test_encoding
test_encoding = tokenizer(test_data['document'], **encode_kwargs)

In [11]:
# Inspect the encoded training data
train_encoding

{'input_ids': tensor([[   101,  11399,  12225,  ...,      0,      0,      0],
        [   101,  25701,   9279,  ...,      0,      0,      0],
        [   101,   9061, 119309,  ...,      0,      0,      0],
        ...,
        [   101,  80956,  79633,  ...,      0,      0,      0],
        [   101,   8924, 118729,  ...,      0,      0,      0],
        [   101,   9353,   9420,  ...,      0,      0,      0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [12]:
# BERT special tokens: print their ids and their string forms
print(f"special token ids : {tokenizer.all_special_ids}")
print(f"special token tokens : {tokenizer.all_special_tokens}")

special token ids : [100, 102, 0, 101, 103]
special token tokens : ['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']


[Bert fine-tuning: 학습 데이터 만들기]
1. input_ids
2. token_type_ids
3. attention_mask  
----------------------> tokenizer를 활용하여 손쉽게 구성 완료
4. labels

In [13]:
# custom 데이터 셋 class 구성
import torch
from torch.utils.data import Dataset

class NSMCDataset(Dataset):
    """Map-style dataset that pairs tokenizer encodings with their labels.

    ``encodings`` is a mapping whose values can be indexed by example position
    (e.g. the tokenizer output); ``labels`` is an indexable sequence of labels.
    """

    def __init__(self, encodings, labels):
        self.encoding = encodings
        self.labels = labels

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one example: every encoding field at ``idx`` plus a long-typed ``labels`` tensor."""
        sample = {}
        for field, values in self.encoding.items():
            sample[field] = values[idx]
        label = torch.tensor(self.labels[idx])
        sample['labels'] = label.long()
        return sample

In [14]:
# Instantiate the dataset class for the train and test subsets
train_set = NSMCDataset(train_encoding, train_data['label'])
test_set = NSMCDataset(test_encoding, test_data['label'])

In [15]:
# Inspect the first example of the train set
train_set[0]

{'input_ids': tensor([  101, 11399, 12225,   119,  9788,  9435, 10739, 71439, 11467,  9485,
         38709, 70146,  9788,  9435, 10739, 71439, 11467,  8977, 33305, 11903,
           119,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0,

In [16]:
# Inspect the first example of the test set
test_set[0]

{'input_ids': tensor([  101, 14796, 27728, 10230,   106,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [17]:
# !pip install accelerate -U -> Trainer 클래스 사용 시

In [18]:
# Print the installed transformers and accelerate versions
import transformers
import accelerate

print(transformers.__version__)
print(accelerate.__version__)


4.40.2
0.30.1


In [19]:
# Upgrade accelerate (needed when using the Trainer class).
# NOTE(review): accelerate has already been imported by an earlier cell, so an upgraded
# version presumably only takes effect after restarting the kernel — confirm.
!pip install accelerate -U



In [20]:
# Train with the Trainer class: configure the run first
from transformers import TrainingArguments, Trainer

# 10 epochs, batch size 32 per device for both training and evaluation;
# log every 50 steps, save a checkpoint every 50 steps, keep at most 2 checkpoints.
training_args = TrainingArguments(
    output_dir = './outputs',
    logging_dir = './logs',
    num_train_epochs=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    logging_steps=50,
    save_steps=50,
    save_total_limit=2
)

In [24]:
# Pick the device used for training: 'cuda' when a GPU is available, otherwise 'cpu'
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [25]:
# datasets의 load_metric을 활용하여 성능 측정
from datasets import load_metric

def compute_metrics(pred):
  """Compute accuracy and F1 for one evaluation pass of the Trainer.

  Args:
    pred: prediction object passed in by ``Trainer``. This function reads
      ``pred.predictions`` (per-class scores; argmax over the last axis gives
      the predicted class) and ``pred.label_ids`` (gold labels).

  Returns:
    dict with the keys ``'accuracy'`` and ``'f1'``.
  """
  labels = pred.label_ids
  # The scores live directly on `pred`; there is no nested `.pred` attribute.
  preds = pred.predictions.argmax(-1)
  accuracy_metric = load_metric('accuracy')
  f1_metric = load_metric('f1')

  acc = accuracy_metric.compute(predictions=preds, references=labels)['accuracy']
  # Keep F1 in its own variable so it does not overwrite the accuracy value.
  f1 = f1_metric.compute(predictions=preds, references=labels)['f1']

  return {'accuracy': acc, 'f1': f1}

In [23]:
# Move the model to the selected device
model.to(device)
# Build the Trainer: model, training arguments, train/eval datasets and the metric function
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_set,
    eval_dataset=test_set,
    compute_metrics=compute_metrics
)

In [None]:
# Train the model
trainer.train()

In [None]:
#{'loss': 0.6646, 'learning_rate': 4.603174603174603e-05, 'epoch': 0.79}
# {'loss': 0.5363, 'learning_rate': 4.2063492063492065e-05, 'epoch': 1.59}
# {'loss': 0.3994, 'learning_rate': 3.809523809523809e-05, 'epoch': 2.38}
# {'loss': 0.3099, 'learning_rate': 3.412698412698413e-05, 'epoch': 3.17}
# {'loss': 0.188, 'learning_rate': 3.0158730158730158e-05, 'epoch': 3.97}
# {'loss': 0.1881, 'learning_rate': 2.6190476190476192e-05, 'epoch': 4.76}
# {'loss': 0.1174, 'learning_rate': 2.2222222222222223e-05, 'epoch': 5.56}
# {'loss': 0.0919, 'learning_rate': 1.8253968253968254e-05, 'epoch': 6.35}
# {'loss': 0.0817, 'learning_rate': 1.4285714285714285e-05, 'epoch': 7.14}
# {'loss': 0.0565, 'learning_rate': 1.0317460317460318e-05, 'epoch': 7.94}
# {'loss': 0.0391, 'learning_rate': 6.349206349206349e-06, 'epoch': 8.73}
# {'loss': 0.0426, 'learning_rate': 2.3809523809523808e-06, 'epoch': 9.52}
# {'train_runtime': 1618.84, 'train_samples_per_second': 12.355, 'train_steps_per_second': 0.389, 'train_loss': 0.2167289758485461, 'epoch': 10.0}
# TrainOutput(global_step=630, training_loss=0.2167289758485461, metrics={'train_runtime': 1618.84, 'train_samples_per_second': 12.355, 'train_steps_per_second': 0.389, 'train_loss': 0.2167289758485461, 'epoch': 10.0})

In [None]:
# Evaluate the model
trainer.evaluate()

In [None]:
# {'eval_loss': 1.198122501373291,
#  'eval_accuracy': 0.766,
#  'eval_f1': 0.7662337662337662,
#  'eval_runtime': 46.6907,
#  'eval_samples_per_second': 42.835,
#  'eval_steps_per_second': 1.349,
#  'epoch': 10.0}

In [None]:
# ! pip install ipywidgets
# ! conda install -c conda-forge ipympl
# !jupyter labextension list

# PyTorch를 활용한 네이티브 방식으로 학습하기  
# : TrainingArguments와 Trainer를 활용하지 않고 학습

In [17]:
from torch.utils.data import DataLoader

# DataLoaders: reshuffle the training examples every epoch so batches differ between
# epochs; the test loader keeps a fixed order.
train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
test_loader = DataLoader(test_set, batch_size=32)

In [18]:
# Load the pretrained checkpoint again; this rebinds `model` to a newly loaded instance
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# Sanity-check the model: tokenize one document, print the inputs, and run a forward pass
dummy = tokenizer(train_data['document'][0], return_tensors='pt')
print(dummy)
model(**dummy)

{'input_ids': tensor([[  101, 11399, 12225,   119,  9788,  9435, 10739, 71439, 11467,  9485,
         38709, 70146,  9788,  9435, 10739, 71439, 11467,  8977, 33305, 11903,
           119,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


SequenceClassifierOutput(loss=None, logits=tensor([[-0.2521, -0.2285]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [20]:
from tqdm.notebook import tqdm
from datasets import load_metric

# train 메소드
def train(epoch, model, dataloader, optimizer, device):
    """Fine-tune ``model`` with a plain PyTorch loop and print metrics after each epoch.

    Args:
        epoch: number of epochs to run (epochs are numbered from 1).
        model: model whose forward pass returns an object with ``.loss`` and
            ``.logits`` when the batch contains ``labels``.
        dataloader: iterable of batches; each batch is a dict of tensors that
            includes a ``'labels'`` entry.
        optimizer: optimizer built over ``model``'s parameters.
        device: device the model and every batch are moved to.

    Returns:
        None. The average loss, accuracy and F1 are printed once per epoch.
    """
    model.to(device)
    m1 = load_metric('accuracy')
    m2 = load_metric('f1')
    for e in range(1, epoch + 1):
        # Models returned by from_pretrained are in evaluation mode; switch to
        # training mode so dropout is active while the weights are updated.
        model.train()
        total_loss = 0.
        preds = []
        labels = []
        progress_bar = tqdm(dataloader, desc=f'TRAIN - EPOCH {e} |')
        for data in progress_bar:
            data = {k: v.to(device) for k, v in data.items()}
            output = model(**data)
            current_loss = output.loss
            # Accumulate a Python float, not the tensor: summing tensors would keep
            # every batch's autograd graph alive for the whole epoch.
            loss_value = current_loss.item()
            total_loss += loss_value

            # Predicted class = argmax over the logits; store plain ints on the host.
            preds += output.logits.argmax(-1).tolist()
            labels += data['labels'].tolist()

            # Parameter update
            optimizer.zero_grad()
            current_loss.backward()
            optimizer.step()
            progress_bar.set_description(f'TRAIN - EPOCH {e} | current-loss: {loss_value:.4f}')

        # Epoch finished: report accuracy and F1 (the keyword is `references`).
        acc = m1.compute(predictions=preds, references=labels)['accuracy']
        f1 = m2.compute(predictions=preds, references=labels)['f1']
        avg = total_loss / len(dataloader)

        print('='*64)
        print(f'TRAIN - EPOCH {e} | Loss : {avg:.4f} ACC: {acc:.4f} F1: {f1:.4f} ')
        print('='*64)

In [21]:
# evaluate 메소드
def evaluate(model, dataloader, device):
    """Run ``model`` over ``dataloader`` without gradient tracking and print the metrics once.

    Args:
        model: model whose forward pass returns an object with ``.loss`` and
            ``.logits`` when the batch contains ``labels``.
        dataloader: iterable of batches; each batch is a dict of tensors that
            includes a ``'labels'`` entry.
        device: device the model and every batch are moved to.

    Returns:
        None. The average loss, accuracy and F1 over the whole loader are printed.
    """
    model.to(device)
    # Evaluation mode: disables dropout so predictions are deterministic.
    model.eval()

    m1 = load_metric('accuracy')
    m2 = load_metric('f1')

    total_loss = 0.
    preds = []
    labels = []
    progress_bar = tqdm(dataloader, desc=f'EVAL |')
    for data in progress_bar:
        data = {k: v.to(device) for k, v in data.items()}

        # No gradients are needed for evaluation.
        with torch.no_grad():
            output = model(**data)

        loss_value = output.loss.item()
        total_loss += loss_value

        # Predicted class = argmax over the logits; store plain ints on the host.
        preds += output.logits.argmax(-1).tolist()
        labels += data['labels'].tolist()
        progress_bar.set_description(f'EVAL | current-loss: {loss_value:.4f}')

    # Metrics are computed once, after the loop has seen every batch; doing this
    # inside the loop printed a partial summary per batch.
    acc = m1.compute(predictions=preds, references=labels)['accuracy']
    f1 = m2.compute(predictions=preds, references=labels)['f1']
    avg = total_loss / len(dataloader)

    print('='*64)
    print(f'EVAL | Loss : {avg:.4f} ACC: {acc:.4f} F1: {f1:.4f} ')
    print('='*64)

In [22]:
# Optimizer
from torch.optim import AdamW
# Load the pretrained checkpoint again, then build AdamW (lr=5e-5) over that model's parameters
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
optimizer = AdamW(model.parameters(), lr=5e-5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Train for 5 epochs with the native PyTorch loop
train(5, model, train_loader, optimizer, device)

In [None]:
# Evaluate on the test loader
evaluate(model, test_loader, device)

In [None]:
# EVAL | LOSS: 1.2046 ACC: 0.7710 F1: 0.7644