# Transformer with Huggingface
- 이전까지는 모델 구조 자체에 신경 썼지만, 이제는 학습된 가중치를 어떻게 사용하느냐가 중요
- BERT: transformer 인코더 > MLM & NSP로 pre-train 후 fine-tuning(재학습)
- 모델을 어떻게 학습시키느냐와 사전학습 weight(더 중요)
- KoBERT: 한국어 자료로 사전학습
- 또한 상황에 따라 모델 크기를 다르게 사용: GPT small, large... (디코더 레이어 수와 hidden dim size가 다름), 커질수록 더 많은 GPU 자원 필요
- 모델/가중치마다 tokenizer 다르게 사용
- Architecture, Pretrained Weight, Config(scale, dim), Tokenizer

huggingface는 이러한 과정을 단순화
- models에서 weight variation 탐색 가능

In [1]:
pip install transformers




In [2]:
from transformers import BertConfig, BertForMaskedLM
# To read the actual implementation, import the class from its concrete module path
from transformers.models.bert.modeling_bert import BertForMaskedLM

In [3]:
# Case: training the model yourself (from scratch) with a hand-written config
config = BertConfig(vocab_size=40000, hidden_size=256, num_hidden_layers=4, num_attention_heads=4, intermediate_size=1024, max_position_embeddings=1024)
model = BertForMaskedLM(config)
print(model)
# In the end vocab_size=40000 -> out_features=40000 

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(40000, 256, padding_idx=0)
      (position_embeddings): Embedding(1024, 256)
      (token_type_embeddings): Embedding(2, 256)
      (LayerNorm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=256, out_features=256, bias=True)
              (LayerNorm): LayerNorm((256,), eps=1e-12, elementwise_affine=T

In [4]:
# Case: reusing a pretrained setup

# Fetches how many layers are stacked, vocab_size, hidden dim size, etc.
# 12 layers, attention heads=12, pad_token = 0, vocab_size=30522, GELU, ...
# If the weights are not loaded, the model is randomly initialized
# uncased: no distinction between upper and lower case
config = BertConfig.from_pretrained("bert-base-uncased")
model = BertForMaskedLM(config)
print(model)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [5]:
# Case: load the pretrained weights as well as the config
model = BertForMaskedLM.from_pretrained("bert-base-uncased")
print(model)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [6]:
from transformers import BertTokenizerFast

In [7]:
# Load the tokenizer published under the same checkpoint name as the model
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
print(tokenizer)

PreTrainedTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})


In [8]:
from pprint import pprint
text = ["Hello, My name is Seokjin Oh", "I go to school"]
pprint(tokenizer.tokenize(text))
pprint(tokenizer(text, return_tensors='pt', padding=True))
# input_ids: the integer id each subword token is mapped to
# attention_mask: when the sequences differ in length the shorter one is padded;
#                 1 marks a real token and 0 marks a padded position (see the output below)
#                 (original note) returned as lists -> no padding
#                 (original note) returned as tensors -> padding present
# token_type_ids: segment id used to tell the two sentences of a pair apart
#                 here each string is encoded as its own single sequence, so every value is 0

['hello',
 ',',
 'my',
 'name',
 'is',
 'seo',
 '##k',
 '##jin',
 'oh',
 'i',
 'go',
 'to',
 'school']
{'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]]),
 'input_ids': tensor([[  101,  7592,  1010,  2026,  2171,  2003, 27457,  2243, 14642,  2821,
           102],
        [  101,  1045,  2175,  2000,  2082,   102,     0,     0,     0,     0,
             0]]),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}


In [9]:
# Check which special tokens were inserted
# 101=[CLS], 102=[SEP], 103=[MASK], 0=[PAD]
# Positional encoding (the "taegeuk"-like pattern) = sinusoidal or absolute position
# NOTE(review): the printed BERT model lists position_embeddings as Embedding(512, 768) — confirm which scheme applies

encodings = tokenizer(text, return_tensors='pt', padding=True)
print(tokenizer.convert_ids_to_tokens(encodings['input_ids'][0]))
print(tokenizer.convert_ids_to_tokens(encodings['input_ids'][1]))

# (lecture timestamp) 57:30
# Q. If the input is split by [SEP], shouldn't token_type_ids switch between 0 and 1? Why are they all 0 here?
#    NOTE(review): the two strings are separate batch items, not a pair; tokenizer("ABC", "DEF") later in this notebook yields 0s and 1s
# Q. Why is there no [EOS] token?

['[CLS]', 'hello', ',', 'my', 'name', 'is', 'seo', '##k', '##jin', 'oh', '[SEP]']
['[CLS]', 'i', 'go', 'to', 'school', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']


In [12]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
encodings = tokenizer(
    "We are very happy to [MASK] you the Transformers library", return_tensors='pt'
)
pprint(encodings)
# size = 1 x seq_length -> the model is fed inputs shaped batch_size x seq_length

{'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 'input_ids': tensor([[  101,  2057,  2024,  2200,  3407,  2000,   103,  2017,  1996, 19081,
          3075,   102]]),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}


In [15]:
model = BertForMaskedLM.from_pretrained("bert-base-uncased")
# Unpack the encoding dict (input_ids, token_type_ids, attention_mask) as keyword arguments
outputs = model(**encodings)

# argmax over the last axis picks one predicted token id per position
pprint(outputs.logits.argmax(dim=-1))
# logits shape: B*S*V = 1 * seq_length * vocab_size

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tensor([[ 1012,  2057,  2024,  2200,  3407,  2000,  2265,  2017,  1996, 19081,
          1012,  1012]])


In [18]:
print(tokenizer.decode(outputs.logits.argmax(dim=-1).squeeze(0)))
# The [MASK] position is predicted as "show"

. we are very happy to show you the transformers..


In [20]:
# Fine-tune the pretrained model -> BertForSequenceClassification
from transformers import BertForSequenceClassification
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [23]:
# Two positional texts are encoded as one sentence pair: in the output below,
# token_type_ids are 0 for the first segment and 1 for the second
tokenizer("ABC", "DEF")

{'input_ids': [101, 5925, 102, 13366, 102], 'token_type_ids': [0, 0, 0, 1, 1], 'attention_mask': [1, 1, 1, 1, 1]}

In [24]:
pip install datasets

Note: you may need to restart the kernel to use updated packages.


In [26]:
import os
from datasets import load_dataset
# Load the IMDB dataset; the log below reports train, test and unsupervised splits
data = load_dataset("imdb")

Downloading builder script:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to C:\Users\OhSeokjin\.cache\huggingface\datasets\imdb\plain_text\1.0.0\2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1...


Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset imdb downloaded and prepared to C:\Users\OhSeokjin\.cache\huggingface\datasets\imdb\plain_text\1.0.0\2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Peek at the first 10 training examples
data['train'][0:10]

In [28]:
import re
def preprocessing(sample):
    """Remove self-closing HTML tags from a review and normalise whitespace.

    Args:
        sample: Mapping with a ``'text'`` string and a ``'label'`` value
            (one dataset example).

    Returns:
        A dict with the cleaned ``'text'`` and the unchanged ``'label'``.
    """
    # The earlier pattern r'<[^(?:/>)]+/>' used a character class where a
    # group was intended. The class allowed '<', so a stray '<' (e.g. "<3")
    # made the match run up to the next "/>" and delete review text. It also
    # skipped tags containing ':', '/', '(' , ')' or '?'. Match only a real
    # tag: '<', a letter, anything except angle brackets, then '/>'.
    without_tags = re.sub(r'<[A-Za-z][^<>]*/>', ' ', sample['text'])
    return {
        # split()/join collapses every whitespace run into a single space
        'text': ' '.join(without_tags.split()),
        'label': sample['label'],
    }

In [29]:
# Show the train split summary (feature names and row count)
data['train']

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})

In [30]:
# Apply preprocessing to every example; the progress bars below show one pass per split
preprocessed = data.map(preprocessing)

  0%|          | 0/25000 [00:00<?, ?ex/s]

  0%|          | 0/25000 [00:00<?, ?ex/s]

  0%|          | 0/50000 [00:00<?, ?ex/s]

In [32]:
# Inspect the first cleaned training example
preprocessed['train'][0]

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself. The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men. What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, even then it\'s

In [35]:
# Tokenize the cleaned reviews in batches and replace the raw text with model inputs.
# truncation=True cuts over-long reviews — presumably at the tokenizer's model_max_len
# (512 per the tokenizer repr printed earlier); TODO confirm.
preprocessed = preprocessed.map(
    lambda sample: tokenizer(sample['text'], truncation=True),
    remove_columns=['text'],  # drop the raw string column after tokenization
    batched=True  # the lambda receives a batch of examples per call
)

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

In [39]:
# Inspect the first tokenized training example (label plus input_ids etc.)
preprocessed['train'][0]

{'label': 0,
 'input_ids': [101,
  1045,
  12524,
  1045,
  2572,
  8025,
  1011,
  3756,
  2013,
  2026,
  2678,
  3573,
  2138,
  1997,
  2035,
  1996,
  6704,
  2008,
  5129,
  2009,
  2043,
  2009,
  2001,
  2034,
  2207,
  1999,
  3476,
  1012,
  1045,
  2036,
  2657,
  2008,
  2012,
  2034,
  2009,
  2001,
  8243,
  2011,
  1057,
  1012,
  1055,
  1012,
  8205,
  2065,
  2009,
  2412,
  2699,
  2000,
  4607,
  2023,
  2406,
  1010,
  3568,
  2108,
  1037,
  5470,
  1997,
  3152,
  2641,
  1000,
  6801,
  1000,
  1045,
  2428,
  2018,
  2000,
  2156,
  2023,
  2005,
  2870,
  1012,
  1996,
  5436,
  2003,
  8857,
  2105,
  1037,
  2402,
  4467,
  3689,
  3076,
  2315,
  14229,
  2040,
  4122,
  2000,
  4553,
  2673,
  2016,
  2064,
  2055,
  2166,
  1012,
  1999,
  3327,
  2016,
  4122,
  2000,
  3579,
  2014,
  3086,
  2015,
  2000,
  2437,
  2070,
  4066,
  1997,
  4516,
  2006,
  2054,
  1996,
  2779,
  25430,
  14728,
  2245,
  2055,
  3056,
  2576,
  3314,
  2107,
  2004,
  1

In [40]:
from transformers import DataCollatorWithPadding

# Collator built on the tokenizer; used below to turn a list-like slice of
# examples into one padded tensor batch
collator = DataCollatorWithPadding(tokenizer)

In [41]:
# Collate the first five tokenized examples into one batch: shorter sequences are
# padded with 0 and the 'label' column appears as 'labels' in the output below
collator(preprocessed['train'][:5])

{'input_ids': tensor([[  101,  1045, 12524,  ...,     0,     0,     0],
        [  101,  1000,  1045,  ...,     0,     0,     0],
        [  101,  2065,  2069,  ...,     0,     0,     0],
        [  101,  2023,  2143,  ...,     0,     0,     0],
        [  101,  2821,  1010,  ...,  2007,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([0, 0, 0, 0, 0])}