# Hugging Face Hackathon 2023 튜토리얼

![](https://www.hpcwire.com/wp-content/uploads/2022/12/hugging-face.png)

작성자: 정우준 (karl7ung@gmail.com)

## 시작 전 준비

In [2]:
!pip install -q transformers
!pip install -q datasets
!pip install accelerate -U

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m54.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m103.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m85.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2

In [3]:
from collections import defaultdict, Counter
import json

import numpy as np
import torch

# 토큰화한 결과를 보기 편하게 만드는 함수
def print_encoding(model_inputs, indent=4):
    indent_str = " " * indent
    print("{")
    for k, v in model_inputs.items():
        print(indent_str + k + ":")
        print(indent_str + indent_str + str(v))
    print("}")

## Part 0. 전체 과정 미리보기

![](https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter2/full_nlp_pipeline.svg)

In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# 토크나이저 불러오기
tokenizer = AutoTokenizer.from_pretrained("WhitePeak/bert-base-cased-Korean-sentiment")
# 모델 불러오기
model = AutoModelForSequenceClassification.from_pretrained("WhitePeak/bert-base-cased-Korean-sentiment")

Downloading (…)okenizer_config.json:   0%|          | 0.00/367 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/895 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/711M [00:00<?, ?B/s]

In [5]:
# 전체 과정 미리보기
inputs = "음.... usb 직접 연결해서 충전하는게 편한데.. 아쉽네요.. 방수 때문에 그런건가..."
tokenized_inputs = tokenizer(inputs, return_tensors="pt")
outputs = model(**tokenized_inputs)

labels = ['NEGATIVE', "POSITIVE"]
predictions = torch.argmax(outputs.logits)


print("Input:")
print(inputs)
print()
print("Tokenized Inputs:")
print_encoding(tokenized_inputs)
print()
print("Model Outputs:")
print(outputs)
print()
print(f"The prediction is {labels[predictions]}")

Input:
음.... usb 직접 연결해서 충전하는게 편한데.. 아쉽네요.. 방수 때문에 그런건가...

Tokenized Inputs:
{
    input_ids:
        tensor([[   101,   9634,    119,    119,    119,    119,  19626,  10457,  67288,
           9568,  74322,  70146,   9770,  16617,  12178,  14153,   9924,  11102,
          28911,    119,    119,   9519, 119072,  77884,  48549,    119,    119,
           9328,  15891,  20729,   8924,  56710,  71439,  11287,    119,    119,
            119,    102]])
    token_type_ids:
        tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
    attention_mask:
        tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
}

Model Outputs:
SequenceClassifierOutput(loss=None, logits=tensor([[ 2.5508, -2.3483]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

The prediction is NEGATIVE


## Part 1. 기본적인 허깅페이스 사용법

### 1.1 토크나이저(Tokenizers)

BERT 같은 트랜스포머 모델은 원시 문자열을 입력으로 받지 못합니다. 그래서 문자열을 모델이 이해할 수 있는 형태로 바꿔주는 과정이 필요합니다.  
**토크나이저**는 텍스트를 토큰으로 나누고, 그 토큰을 정수에 매핑하는 역할을 합니다.  

In [6]:
# 토크나이저의 종류
from transformers import DistilBertTokenizer, DistilBertTokenizerFast, AutoTokenizer

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-cased")        # Python
print(tokenizer)
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-cased")    # Rust
print(tokenizer)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")              # 제일 편리한 방법 (Default: Fast)
print(tokenizer)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

DistilBertTokenizer(name_or_path='distilbert-base-cased', vocab_size=28996, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
DistilBertTokenizerFast(name_or_path='distilbert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padd

In [7]:
# 토크나이저를 사용하는 법
input_str = "Welcome to the Hugging Face Hackathon 2023!"
tokenized_inputs = tokenizer(input_str)


print("Vanilla Tokenization")
print_encoding(tokenized_inputs)
print()

# Input_id에 접근하는 방법 2가지
print(tokenized_inputs.input_ids)
print(tokenized_inputs["input_ids"])

Vanilla Tokenization
{
    input_ids:
        [101, 12050, 1106, 1103, 20164, 10932, 10289, 11679, 2158, 9779, 1320, 17881, 1495, 106, 102]
    attention_mask:
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
}

[101, 12050, 1106, 1103, 20164, 10932, 10289, 11679, 2158, 9779, 1320, 17881, 1495, 106, 102]
[101, 12050, 1106, 1103, 20164, 10932, 10289, 11679, 2158, 9779, 1320, 17881, 1495, 106, 102]


In [8]:
cls = [tokenizer.cls_token_id]
sep = [tokenizer.sep_token_id]

# 토큰화가 이루어지는 단계
input_tokens = tokenizer.tokenize(input_str)                # 1. 텍스트를 토큰으로 분리
input_ids = tokenizer.convert_tokens_to_ids(input_tokens)   # 2. 토큰을 input ID로 변환
input_ids_special_tokens = cls + input_ids + sep            # 3. 특수 토큰([CLS], [SEP]) 추가

decoded_str = tokenizer.decode(input_ids_special_tokens)    # 디코딩(Decoding)

print("start:                   ", input_str)
print("tokenize:                ", input_tokens)
print("convert_tokens_to_ids:   ", input_ids)
print("add special tokens:      ", input_ids_special_tokens)
print("=========================")
print("decode:                  ", decoded_str)

start:                    Welcome to the Hugging Face Hackathon 2023!
tokenize:                 ['Welcome', 'to', 'the', 'Hu', '##gging', 'Face', 'Ha', '##ck', '##ath', '##on', '202', '##3', '!']
convert_tokens_to_ids:    [12050, 1106, 1103, 20164, 10932, 10289, 11679, 2158, 9779, 1320, 17881, 1495, 106]
add special tokens:       [101, 12050, 1106, 1103, 20164, 10932, 10289, 11679, 2158, 9779, 1320, 17881, 1495, 106, 102]
decode:                   [CLS] Welcome to the Hugging Face Hackathon 2023! [SEP]


In [9]:
# Fast Tokenizer를 사용하는 경우, 다른 방법을 사용할 수도 있습니다!
inputs = tokenizer._tokenizer.encode(input_str)

print(input_str)
print("="*50)
print(f"Number of tokens: {len(inputs)}")
print(f"Ids: {inputs.ids}")
print(f"Tokens: {inputs.tokens}")
print(f"Special tokens mask: {inputs.special_tokens_mask}")
print()
print("char_to_token는 입력의 글자가 어떤 wordpiece에 있는지 알려줍니다")
char_idx = 15
print(f"예를 들어, 입력의 {char_idx + 1}번째 글자는 '{input_str[char_idx]}'이고, {inputs.char_to_token(char_idx)}번째 wordpiece인 {inputs.tokens[inputs.char_to_token(char_idx)]}의 일부입니다.")

Welcome to the Hugging Face Hackathon 2023!
Number of tokens: 15
Ids: [101, 12050, 1106, 1103, 20164, 10932, 10289, 11679, 2158, 9779, 1320, 17881, 1495, 106, 102]
Tokens: ['[CLS]', 'Welcome', 'to', 'the', 'Hu', '##gging', 'Face', 'Ha', '##ck', '##ath', '##on', '202', '##3', '!', '[SEP]']
Special tokens mask: [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]

char_to_token는 입력의 글자가 어떤 wordpiece에 있는지 알려줍니다
예를 들어, 입력의 16번째 글자는 'H'이고, 4번째 wordpiece인 Hu의 일부입니다.


In [10]:
# 유용한 기술들
# 토크나이저는 PyTorch tensor를 리턴할 수 있습니다!
model_inputs = tokenizer("Hugging Face Transformers is cool!", return_tensors='pt') # Tensorflow 나 Jax도 가능합니다!
print("PyTorch Tensors:")
print_encoding(model_inputs)

PyTorch Tensors:
{
    input_ids:
        tensor([[  101, 20164, 10932, 10289, 25267,  1110,  4348,   106,   102]])
    attention_mask:
        tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])
}


In [11]:
# 여러 문장을 한 번에 토크나이즈하고 패딩할 수 있습니다
model_inputs = tokenizer(["Hugging Face Transformers is cool!",
                          "The quick brown fox jumps over the lazy dog. Then the do got up and ran away because she didn't like foxes."],
                         return_tensors='pt',
                         padding=True,
                         truncation=True)
print(f"Pad token: {tokenizer.pad_token} | Pad token id: {tokenizer.pad_token_id}")
print("Padding:")
print_encoding(model_inputs)

Pad token: [PAD] | Pad token id: 0
Padding:
{
    input_ids:
        tensor([[  101, 20164, 10932, 10289, 25267,  1110,  4348,   106,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  1109,  3613,  3058, 17594, 15457,  1166,  1103, 16688,  3676,
           119,  1599,  1103,  1202,  1400,  1146,  1105,  1868,  1283,  1272,
          1131,  1238,   112,   189,  1176, 17594,  1279,   119,   102]])
    attention_mask:
        tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1]])
}


In [12]:
# 마찬가지로 여러 문장을 한 번에 디코딩할 수 있습니다
print("Batch Decode:")
print(tokenizer.batch_decode(model_inputs.input_ids))
print()
print("Batch Decode: (no special characters)")
print(tokenizer.batch_decode(model_inputs.input_ids, skip_special_tokens=True))

Batch Decode:
['[CLS] Hugging Face Transformers is cool! [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]', "[CLS] The quick brown fox jumps over the lazy dog. Then the do got up and ran away because she didn't like foxes. [SEP]"]

Batch Decode: (no special characters)
['Hugging Face Transformers is cool!', "The quick brown fox jumps over the lazy dog. Then the do got up and ran away because she didn't like foxes."]


#### 상품 리뷰 감정 분류 예시

In [13]:
# 쇼핑 감성 분석 예제

# 토크나이저 불러오기
tokenizer = AutoTokenizer.from_pretrained("WhitePeak/bert-base-cased-Korean-sentiment")

input_str = "음.... usb 직접 연결해서 충전하는게 편한데.. 아쉽네요.. 방수 때문에 그런건가..."

print("Tokenization: ")
tokenized_input = tokenizer(input_str)
print_encoding(tokenized_input)
print()

decoded_str = tokenizer.decode(tokenized_input.input_ids)
print("Decode:", decoded_str)

Tokenization: 
{
    input_ids:
        [101, 9634, 119, 119, 119, 119, 19626, 10457, 67288, 9568, 74322, 70146, 9770, 16617, 12178, 14153, 9924, 11102, 28911, 119, 119, 9519, 119072, 77884, 48549, 119, 119, 9328, 15891, 20729, 8924, 56710, 71439, 11287, 119, 119, 119, 102]
    token_type_ids:
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    attention_mask:
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
}

Decode: [CLS] 음.... usb 직접 연결해서 충전하는게 편한데.. 아쉽네요.. 방수 때문에 그런건가... [SEP]


### 1.2 모델(Models)

허깅페이스의 큰 장점 중 하나는 다른 사람이나 기업이 올린 모델을 쉽게 가져다 쓸 수 있다는 것입니다!

모델을 그냥 사용할 수도 있고, 자신이 필요한 태스크에 따라서 헤드를 붙여서 사용할 수도 있습니다.  
헤드의 종류는 아래와 같습니다.  

```
*
*ForMaskedLM
*ForSequenceClassification
*ForTokenClassification
*ForQuestionAnswering
*ForMultipleChoice
...
```
https://huggingface.co/docs/transformers/model_doc/auto

![](https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter2/transformer_and_head.svg)

In [14]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")

input_str = "Hugging Face Transformers is cool!"

print(input_str)
print("="*50)
model_inputs = tokenizer(input_str, return_tensors='pt')

Hugging Face Transformers is cool!


In [15]:
from transformers import AutoModelForSequenceClassification, DistilBertForSequenceClassification

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-cased', num_labels=2)
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-cased', num_labels=2)


Downloading model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Sequence Classification 태스크에 대한 파라미터는 아직 학습되지 않았기 때문에 위와 같은 경고가 뜹니다.

In [16]:

# 방법 1
model_outputs = model(input_ids=model_inputs.input_ids, attention_mask=model_inputs.attention_mask)

# 방법 2
model_outputs = model(**model_inputs)

print(model_inputs)
print()
print(model_outputs)
print()

model_inputs = tokenizer(input_str, return_tensors='pt')

labels = ["NEGATIVE", "POSITIVE"]
model_inputs['labels'] = torch.tensor([1])

model_outputs = model(**model_inputs)


print(model_outputs)
print()
print(f"Model predictions: {labels[model_outputs.logits.argmax()]}")

{'input_ids': tensor([[  101, 20164, 10932, 10289, 25267,  1110,  4348,   106,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}

SequenceClassifierOutput(loss=None, logits=tensor([[-0.0369,  0.0166]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

SequenceClassifierOutput(loss=tensor(0.6667, grad_fn=<NllLossBackward0>), logits=tensor([[-0.0369,  0.0166]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

Model predictions: POSITIVE


In [17]:
# 토크나이저 불러오기
tokenizer = AutoTokenizer.from_pretrained("monologg/kobert")
# 모델 불러오기
model = AutoModelForSequenceClassification.from_pretrained("monologg/kobert")

input_str = "조용한건 좋은데 냄새가 너무너무 납니다. 환기를 해도 냄새가 안빠지는데 몸에는 무해할지 걱정이네요."
model_inputs = tokenizer(input_str, return_tensors='pt')
model_outputs = model(**model_inputs)

print(model_outputs)
print(f"Model predictions: {labels[model_outputs.logits.argmax()]}")

Downloading (…)okenizer_config.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/426 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/77.8k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


SequenceClassifierOutput(loss=None, logits=tensor([[-0.2253, -0.2035]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
Model predictions: POSITIVE


#### 상품 리뷰 감정 분류 예시

In [18]:
# 토크나이저 불러오기
tokenizer = AutoTokenizer.from_pretrained("WhitePeak/bert-base-cased-Korean-sentiment")
# 모델 불러오기
model = AutoModelForSequenceClassification.from_pretrained("WhitePeak/bert-base-cased-Korean-sentiment")

input_str = "조용한건 좋은데 냄새가 너무너무 납니다. 환기를 해도 냄새가 안빠지는데 몸에는 무해할지 걱정이네요."
model_inputs = tokenizer(input_str, return_tensors='pt')
model_outputs = model(**model_inputs)

print(model_outputs)
print(f"Model predictions: {labels[model_outputs.logits.argmax()]}")

SequenceClassifierOutput(loss=None, logits=tensor([[ 1.8190, -1.8091]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
Model predictions: NEGATIVE


## Part 2. 파인튜닝(Fine-tuning)



https://huggingface.co/datasets

### 2.1 데이터셋 불러오기

In [25]:
from datasets import load_dataset, DatasetDict

amazon_dataset = load_dataset("KETI-AIR/kor_amazon_polarity")

def truncate(example):
    return {
        "content": " ".join(example['content'].split()[:20]),
        "label": example['label']
    }

small_amazon_dataset = DatasetDict(
    train=amazon_dataset['train'].shuffle(seed=777).select(range(128)).map(truncate),
    val=amazon_dataset['train'].shuffle(seed=777).select(range(128, 160)).map(truncate),
)

small_amazon_dataset['train'][:10]

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

Map:   0%|          | 0/32 [00:00<?, ? examples/s]

{'label': [0, 0, 1, 0, 1, 1, 0, 0, 1, 1],
 'title': ['작은 남자의 촌스러운 작은 책',
  '무슨 일이야, 프랭크?',
  'BIC American DV62si 스피커',
  ' Zero STARS',
  '가격 대비 고급 커버',
  'SUG GLOVE 좋은 작업용 장갑',
  '흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐흐

In [26]:


small_amazon_dataset = small_amazon_dataset.remove_columns(["title", 'data_index_by_user'])
small_amazon_dataset["train"][:10]

{'label': [0, 0, 1, 0, 1, 1, 0, 0, 1, 1],
 'content': ['스테레오타입적 사고(말할 것도 없이 빈약한 글쓰기)에서 이 이해할 수 있는 운동을 읽는 내 인생의 세 시간을 낭비한 후, 존 그레이가',
  '나는 궁금하다. 맥코트는 어떻게 그렇게 빨리 패스트볼을 잃었을까?이 속편은 대부분의 속편과 마찬가지로 평평합니다.나는 우리 대부분의 필사적인 사람들이 특정한 양의 광채와',
  '이것들은 작은 방에 필요한 훌륭한 소형 스피커입니다.나는 내가 연주하는 음악에서 놀라운 선명도를 얻지만, 스피커가 더 광범위한 시스템에 적합하지 않다고 생각합니다.시스템이',
  '이것은 현재의 인간 상태에 영향을 미치는 DNA 없이 우리의 병든 세계가 할 수 있는 바로 그 종류의 "엔터테인먼트"이다.',
  '가방은 나일론으로 만들어졌으며 보관을 위한 일치하는 가방이 함께 제공됩니다.표준 캠핑 텐트와 유사한 소재로 제작되었습니다.예산 충당을 위해 확실히 그만한 가치가 있습니다.',
  '남편은 사랑하는 수그 장갑 한 레를 가지고 있었고, 교체가 필요할 정도로 많이 사용했습니다.원래 구매한 매장에서 찾을 수 없어서 아마존.com에서 주문할',
  '여성들은 아름답고 그것만이 긍정적이에요.줄거리는 지루하고 대사는 끔찍하다.디자이너 빅토리 포드는 가장 추한 옷을 만든다.니코 라일리는 가짜로 만난다.웬디 힐리, 진짜?그녀는 기업 임원이고',
  '전에는 대부분의 노래를 들었지만 LP에서 들었어요.CD는 다루기 쉽고 더 나은 사운드를 제공합니다.',
  '건강과 응급 처치에 대한 훌륭한 조언 외에도 아이들이 다양한 연령대에서 발달적으로 겪고 있는 일에 대한 좋은 설명이 있습니다.이것들은 우리가 수면',
  '토미 라이브를 본 사람이라면 누구를 위한 것인가.어쨌든 그 환상적인 날들과 일치하는 것은 아닙니다.그러나 눈을 감으면 토미가 그 무대에서 술집으로 올라가']}

In [32]:
# 데이터셋 준비 - 여기선 16개의 예제를 하나의 배치로 토큰화하겠습니다
tokenizer = AutoTokenizer.from_pretrained("monologg/kobert")

small_tokenized_dataset = small_amazon_dataset.map(
    lambda example: tokenizer(example['content'], padding=True, truncation=True),
    batched=True,
    batch_size=16
)

small_tokenized_dataset = small_tokenized_dataset.remove_columns(["content", "token_type_ids"])
small_tokenized_dataset = small_tokenized_dataset.rename_column("label", "labels")
small_tokenized_dataset.set_format("torch")

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

Map:   0%|          | 0/32 [00:00<?, ? examples/s]

In [33]:
small_tokenized_dataset['train'][0:2]

{'labels': tensor([0, 0]),
 'input_ids': tensor([[   2,    0, 6496,   18,    0,    0, 6883,    0,    0,   40, 6903, 7096,
             0, 6629, 7142,    0,    0, 5678,    0, 6579,    0,    0, 7968,   46,
          7264,    0,    3,    1,    1,    1,    1,    1,    1],
         [   2, 5658,    0,   54,    0,    0,    0,    0,    0,    0,  258, 7096,
             0,    0,    0,    0,    0,   54, 5658, 7007,    0,    0,    0,    0,
             0,    0,    3,    1,    1,    1,    1,    1,    1]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 0, 0, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 0, 0, 0, 0, 0, 0]])}

In [34]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(small_tokenized_dataset['train'], batch_size=16)
eval_dataloader = DataLoader(small_tokenized_dataset['val'], batch_size=16)

### 2.2 학습하기

In [35]:
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm.notebook import tqdm


model = AutoModelForSequenceClassification.from_pretrained('monologg/kobert', num_labels=2)

num_epochs = 3
num_training_steps = 3 * len(train_dataloader)
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)
lr_scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

best_val_loss = float("inf")
progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_epochs):
    # 학습(Training)
    model.train()
    for batch_i, batch in enumerate(train_dataloader):

        output = model(**batch)

        optimizer.zero_grad()
        output.loss.backward()
        optimizer.step()
        lr_scheduler.step()
        progress_bar.update(1)

    # 평가(Validation)
    model.eval()
    loss = 0
    for batch_i, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            output = model(**batch)
        loss += output.loss

    avg_val_loss = loss / len(eval_dataloader)
    print(f"Validation loss: {avg_val_loss}")
    if avg_val_loss < best_val_loss:
        print("Saving checkpoint!")
        best_val_loss = avg_val_loss
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'val_loss': best_val_loss,
            },
            f"./epoch_{epoch}.pt"
        )

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/24 [00:00<?, ?it/s]

Validation loss: 0.6929049491882324
Saving checkpoint!
Validation loss: 0.6971858143806458
Validation loss: 0.6970363259315491


In [38]:
from transformers import TrainingArguments, Trainer

tokenizer = AutoTokenizer.from_pretrained("monologg/kobert")
model = AutoModelForSequenceClassification.from_pretrained('monologg/kobert', num_labels=2)

arguments = TrainingArguments(
    output_dir="sample_hf_trainer",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    load_best_model_at_end=True,
    seed=224,
    logging_steps=5
)


def compute_metrics(eval_pred):
    """Called at the end of validation. Gives accuracy"""
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": np.mean(predictions == labels)}


trainer = Trainer(
    model=model,
    args=arguments,
    train_dataset=small_tokenized_dataset['train'],
    eval_dataset=small_tokenized_dataset['val'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [39]:
# 모델 훈련
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.7328,0.678418,0.59375
2,0.6823,0.681364,0.59375
3,0.6617,0.680216,0.59375


TrainOutput(global_step=24, training_loss=0.6973832249641418, metrics={'train_runtime': 27.7682, 'train_samples_per_second': 13.829, 'train_steps_per_second': 0.864, 'total_flos': 7597331723520.0, 'train_loss': 0.6973832249641418, 'epoch': 3.0})

In [40]:
# evaluating the model is very easy

# results = trainer.evaluate()                           # just gets evaluation metrics
results = trainer.predict(small_tokenized_dataset['val']) # also gives you predictions

In [41]:
results

PredictionOutput(predictions=array([[ 0.1916449 ,  0.22683379],
       [ 0.22314717, -0.11641365],
       [ 0.20360216, -0.09543565],
       [ 0.19376771, -0.11019652],
       [ 0.22164695, -0.1203385 ],
       [ 0.15413123, -0.14183237],
       [ 0.1821831 , -0.11664201],
       [ 0.16595587, -0.13742277],
       [ 0.18666586, -0.08931123],
       [ 0.1758633 , -0.13222699],
       [ 0.18192168, -0.11654314],
       [ 0.20246285, -0.11635383],
       [ 0.17683929, -0.10863604],
       [ 0.2183371 , -0.10548168],
       [ 0.19470072, -0.09061436],
       [ 0.17531334, -0.12730497],
       [ 0.22123067, -0.08326611],
       [ 0.15778846, -0.11511832],
       [ 0.22013862, -0.0995794 ],
       [ 0.19786586, -0.08956669],
       [ 0.20206194,  0.24839233],
       [ 0.18367225, -0.10238333],
       [ 0.18084264, -0.14627627],
       [ 0.19924533, -0.14783694],
       [ 0.1901607 , -0.09713633],
       [ 0.1657788 , -0.12214693],
       [ 0.19704998, -0.11093342],
       [ 0.16072392, -0.12

In [42]:
# To load our saved model, we can pass the path to the checkpoint into the `from_pretrained` method:
test_str = "음.... usb 직접 연결해서 충전하는게 편한데.. 아쉽네요.. 방수 때문에 그런건가..."

finetuned_model = AutoModelForSequenceClassification.from_pretrained("sample_hf_trainer/checkpoint-24")
model_inputs = tokenizer(test_str, return_tensors="pt")
prediction = torch.argmax(finetuned_model(input_ids=model_inputs.input_ids, attention_mask=model_inputs.attention_mask).logits)
print(["NEGATIVE", "POSITIVE"][prediction])

POSITIVE


## Part 3. Hugging Face에 모델 업로드

In [43]:
!pip install huggingface_hub



In [44]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [46]:
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.60k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

events.out.tfevents.1699642165.b519d41a4941.318.0:   0%|          | 0.00/6.08k [00:00<?, ?B/s]

'https://huggingface.co/jungnerd/sample_hf_trainer/tree/main/'