<a href="https://colab.research.google.com/github/UiinKim/UiinKim/blob/main/BERT_%EC%8B%A4%EC%8A%B51.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. TPU 설정

In [64]:
# Initialize the TPU.
import tensorflow as tf
import os

# The TPU address is read from the COLAB_TPU_ADDR environment variable;
# a KeyError on this line means that variable is not set.
tpu_grpc_address = 'grpc://' + os.environ['COLAB_TPU_ADDR']
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=tpu_grpc_address)

tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)



<tensorflow.python.tpu.topology.Topology at 0x79158ab935e0>

In [65]:
# Set up the TPU distribution strategy from the cluster resolver.
strategy = tf.distribute.TPUStrategy(tpu_cluster_resolver=resolver)

In [66]:
# Compiling the deep-learning model.
# The model has to be compiled inside strategy.scope(): define a builder such as
# create_model() and call it from within strategy.scope().
def create_model():
  """Build an uncompiled Keras Sequential CNN.

  Layers, in order: two Conv2D(256, 3, relu) layers (the first declares
  input_shape=(28, 28, 1)), Flatten, Dense(256, relu), Dense(128, relu) and a
  final Dense(10) with no activation.

  Returns:
    The tf.keras.Sequential model.
  """
  layers = tf.keras.layers
  cnn = tf.keras.Sequential()
  cnn.add(layers.Conv2D(256, 3, activation='relu', input_shape=(28, 28, 1)))
  cnn.add(layers.Conv2D(256, 3, activation='relu'))
  cnn.add(layers.Flatten())
  cnn.add(layers.Dense(256, activation='relu'))
  cnn.add(layers.Dense(128, activation='relu'))
  cnn.add(layers.Dense(10))
  return cnn

In [67]:
# Inside strategy.scope(), call create_model() and compile the resulting model.
with strategy.scope():
  model = create_model()
  # from_logits=True matches the final Dense(10) layer, which has no activation.
  model.compile(
      optimizer='adam',
      loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
      metrics=['sparse_categorical_accuracy'],
  )

2. Transformer 모델 클래스 불러오기

In [68]:
!pip install transformers



In [69]:
# Many-to-one type.
from transformers import TFBertForSequenceClassification

# Usage template (placeholders, not runnable as written):
# model = TFBertForSequenceClassification.from_pretrained('<model name>', num_labels=<number of labels to classify>)


In [70]:
# Many-to-many type.
from transformers import TFBertForTokenClassification

# Usage template (placeholders, not runnable as written):
# model = TFBertForTokenClassification.from_pretrained("<model name>", num_labels=<number of labels to classify>)


In [71]:
# Question-answering type.
from transformers import TFBertForQuestionAnswering

# Usage template (placeholder, not runnable as written):
# model = TFBertForQuestionAnswering.from_pretrained('<model name>')


In [72]:
# Display the version of the installed transformers package.
import transformers
transformers.__version__

'4.31.0'

In [73]:
import pandas as pd
import numpy as np
import urllib.request
import os
from tqdm import tqdm
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel

In [74]:
# Download the train and test ratings files into the working directory.
# The second call is the cell's last expression, so its (filename, headers) result is displayed.
urllib.request.urlretrieve(
    "https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt",
    "ratings_train.txt",
)
urllib.request.urlretrieve(
    "https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt",
    "ratings_test.txt",
)

('ratings_test.txt', <http.client.HTTPMessage at 0x79158ab93070>)

In [75]:
# Load the tab-separated ratings files into DataFrames.
train_data = pd.read_csv('ratings_train.txt', sep='\t')
test_data = pd.read_csv('ratings_test.txt', sep='\t')

In [76]:
# Report the number of rows in the train and test frames.
print("train 데이터 개수 : ", len(train_data))
print("test 데이터 개수 : ", len(test_data))

train 데이터 개수 :  150000
test 데이터 개수 :  50000


In [77]:
# Summarize the training frame's columns, non-null counts and dtypes.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   id        150000 non-null  int64 
 1   document  149995 non-null  object
 2   label     150000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 3.4+ MB


In [78]:
# Summarize the test frame's columns, non-null counts and dtypes.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        50000 non-null  int64 
 1   document  49997 non-null  object
 2   label     50000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.1+ MB


In [79]:
# Remove every row that has a missing value in any column, then print the remaining
# row count of each frame (train first, test second).
train_data = train_data.dropna(how='any')
test_data = test_data.dropna(how='any')
for frame in (train_data, test_data):
  print(len(frame))

149995
49997


In [80]:
# Load the pretrained tokenizer for the 'bert-base-multilingual-cased' checkpoint.
tokenizer=BertTokenizer.from_pretrained('bert-base-multilingual-cased')

In [81]:
# Integer-encode a sentence.
print(tokenizer.encode('보는 내내 그대로 들어맞는 예측 카리스마 없는 악역'))

[101, 9356, 11018, 8996, 31605, 110589, 71568, 118913, 11018, 9576, 119281, 9786, 79940, 23811, 40364, 9520, 23160, 102]


In [82]:
# Tokenize a sentence.
print(tokenizer.tokenize("보는 내내 그대로 들어맞는 예측 카리스마 없는 악역"))

['보', '##는', '내', '##내', '그대로', '들어', '##맞', '##는', '예', '##측', '카', '##리스', '##마', '없는', '악', '##역']


In [83]:
# Encode a sentence, then decode the ids back to text.
print(tokenizer.decode(tokenizer.encode("보는 내내 그대로 들어맞는 예측 카리스마 없는 악역")))

[CLS] 보는 내내 그대로 들어맞는 예측 카리스마 없는 악역 [SEP]


In [84]:
# Decode each id of the encoded sentence separately, printing one result per line.
encoded_ids = tokenizer.encode("보는 내내 그대로 들어맞는 예측 카리스마 없는 악역")
for token_id in encoded_ids:
  print(tokenizer.decode(token_id))

[ C L S ]
보
# # 는
내
# # 내
그 대 로
들 어
# # 맞
# # 는
예
# # 측
카
# # 리 스
# # 마
없 는
악
# # 역
[ S E P ]


In [85]:
# Id numbers of the CLS and SEP tokens.
print(tokenizer.decode(101))
print(tokenizer.decode(102))

[ C L S ]
[ S E P ]


In [86]:
# The same special tokens and their ids, read from the tokenizer's attributes.
print(tokenizer.cls_token, ':', tokenizer.cls_token_id)
print(tokenizer.sep_token, ':', tokenizer.sep_token_id)

[CLS] : 101
[SEP] : 102


In [87]:
# The padding token and its id.
print(tokenizer.pad_token, ':', tokenizer.pad_token_id)

[PAD] : 0


In [88]:
# Set the maximum sentence length.
max_seq_len=128

In [89]:
# Pad up to max_seq_len (128).
# padding='max_length' replaces the deprecated pad_to_max_length=True, and truncation=True
# states the truncation explicitly instead of relying on the implicit fallback that logs
# a "Truncation was not explicitly activated" warning.
encoded_result = tokenizer.encode(
    "전율을 일으키는 영화, 다시 보고싶은 영화",
    max_length=max_seq_len,
    padding='max_length',
    truncation=True,
)
print(encoded_result)
print("길이 : ", len(encoded_result))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[101, 9665, 119183, 10622, 9641, 119185, 66815, 42428, 117, 25805, 98199, 119088, 10892, 42428, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
길이 :  128




In [90]:
# Segment input.
print([0]*max_seq_len) # zeros for the full maximum length

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [91]:
# Mask input: 1 for as many positions as the encoded sentence occupies, 0 for the rest.
valid_num = len(tokenizer.encode("전율을 일으키는 영화, 다시 보고싶은 영화"))
mask_ones = [1] * valid_num
mask_zeros = [0] * (max_seq_len - valid_num)
print(mask_ones + mask_zeros)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [96]:
def convert_examples_to_features(examples, labels, max_seq_len, tokenizer):
  """Encode sentences into fixed-length BERT inputs.

  Args:
    examples: sized iterable of sentences; each one is passed to tokenizer.encode.
    labels: iterable of labels, paired positionally with `examples`.
    max_seq_len: length every encoded sequence is padded or truncated to.
    tokenizer: object providing `encode(text, max_length=..., padding=..., truncation=...)`
      and a `pad_token_id` attribute.

  Returns:
    A pair `((input_ids, attention_masks, token_type_ids), data_labels)`. The three
    inputs are integer NumPy arrays with one row of `max_seq_len` values per example;
    `data_labels` is an int32 NumPy array.

  Raises:
    AssertionError: if an encoded sequence does not have exactly `max_seq_len` ids.
  """
  input_ids, attention_masks, token_type_ids, data_labels = [], [], [], []
  pad_id = tokenizer.pad_token_id

  for example, label in tqdm(zip(examples, labels), total=len(examples)):
    # input_id: integer encoding of the sentence, used for the word embedding.
    # padding='max_length' replaces the deprecated pad_to_max_length=True, and
    # truncation=True makes the cut-off explicit instead of an implicit fallback.
    input_id = tokenizer.encode(
        example, max_length=max_seq_len, padding='max_length', truncation=True)
    assert len(input_id) == max_seq_len, "Error with input length {} vs {}".format(len(input_id), max_seq_len)

    # attention_mask: 1 where a real token sits, 0 at padding positions. Built per
    # position so it stays correct whichever side the tokenizer pads on.
    attention_mask = [int(token != pad_id) for token in input_id]
    assert len(attention_mask) == max_seq_len, "Error with attention mask length {} vs {}".format(len(attention_mask), max_seq_len)

    # token_type_id: segment ids; each example is a single sentence, so all zeros.
    token_type_id = [0] * max_seq_len
    assert len(token_type_id) == max_seq_len, "Error with token type length {} vs {}".format(len(token_type_id), max_seq_len)

    input_ids.append(input_id)
    attention_masks.append(attention_mask)
    token_type_ids.append(token_type_id)
    data_labels.append(label)

  input_ids = np.array(input_ids, dtype=int)
  attention_masks = np.array(attention_masks, dtype=int)
  token_type_ids = np.array(token_type_ids, dtype=int)

  data_labels = np.asarray(data_labels, dtype=np.int32)

  return (input_ids, attention_masks, token_type_ids), data_labels

In [97]:
# Convert the training documents and labels into model inputs (train_X) and a label array (train_y).
train_X, train_y=convert_examples_to_features(train_data['document'], train_data['label'], max_seq_len=max_seq_len, tokenizer=tokenizer)

100%|██████████| 149995/149995 [01:18<00:00, 1922.19it/s]


In [98]:
# Convert the test documents and labels into model inputs (test_X) and a label array (test_y).
test_X, test_y=convert_examples_to_features(test_data['document'], test_data['label'], max_seq_len=max_seq_len, tokenizer=tokenizer)

100%|██████████| 49997/49997 [00:37<00:00, 1339.35it/s]


In [99]:
# Inspect the first training example (max_length: 128).
# train_X holds (input ids, attention masks, token type ids); take row 0 of each.
input_id, attention_mask, token_type_id = (feature[0] for feature in train_X)
label = train_y[0]

print('단어에 대한 정수 인코딩 : ', input_id)
print('어텐션 마스크 : ', attention_mask)
print("세그먼트 인코딩 : ", token_type_id)
print('각 인코딩의 길이 : ', len(input_id))
print('정수 인코딩 복원 : ', tokenizer.decode(input_id))
print('레이블 : ', label)

단어에 대한 정수 인코딩 :  [   101   9519   9074 119005    119    119   9708 119235   9715 119230
  16439  77884  48549   9284  22333  12692    102      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0]
어텐션 마스크 :  [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0

In [100]:
# Load the pretrained 'bert-base-multilingual-cased' weights into a TFBertModel.
# Note: this rebinds the name `model`, which an earlier cell used for the CNN.
model=TFBertModel.from_pretrained('bert-base-multilingual-cased')

Downloading model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [101]:
# max_seq_len is assigned 128 again here; an earlier cell already sets the same value.
max_seq_len=128

In [103]:
# Three int32 Keras inputs of length max_seq_len, created in the order
# input ids -> attention mask -> token type ids.
input_ids_layer, attention_masks_layer, token_type_ids_layer = (
    tf.keras.layers.Input(shape=(max_seq_len,), dtype=tf.int32) for _ in range(3)
)

# Run the symbolic inputs through the BERT model.
bert_inputs = [input_ids_layer, attention_masks_layer, token_type_ids_layer]
outputs = model(bert_inputs)

In [104]:
# Show the full output structure returned by the BERT model.
print(outputs)

TFBaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=<KerasTensor: shape=(None, 128, 768) dtype=float32 (created by layer 'tf_bert_model')>, pooler_output=<KerasTensor: shape=(None, 768) dtype=float32 (created by layer 'tf_bert_model')>, past_key_values=None, hidden_states=None, attentions=None, cross_attentions=None)


In [105]:
# First output field (printed above as last_hidden_state).
print(outputs[0])

KerasTensor(type_spec=TensorSpec(shape=(None, 128, 768), dtype=tf.float32, name=None), name='tf_bert_model/bert/encoder/layer_._11/output/LayerNorm/batchnorm/add_1:0', description="created by layer 'tf_bert_model'")


In [106]:
# Second output field (printed above as pooler_output).
print(outputs[1])

KerasTensor(type_spec=TensorSpec(shape=(None, 768), dtype=tf.float32, name=None), name='tf_bert_model/bert/pooler/dense/Tanh:0', description="created by layer 'tf_bert_model'")


In [107]:
class TFBertForSequenceClassification(tf.keras.Model):
  """Binary sequence classifier: a pretrained BERT encoder followed by one sigmoid unit.

  Note: this definition reuses the name of the TFBertForSequenceClassification class
  imported from transformers earlier in the notebook, and shadows it from here on.
  """

  def __init__(self, model_name):
    """Load the encoder and create the classification head.

    Args:
      model_name: checkpoint name passed to TFBertModel.from_pretrained; the
        weights are loaded from the PyTorch checkpoint (from_pt=True).
    """
    super().__init__()
    self.bert = TFBertModel.from_pretrained(model_name, from_pt=True)
    # TruncatedNormal's first positional parameter is `mean`, so the 0.02 must be
    # passed as `stddev` by keyword to give a zero-mean initializer of that spread.
    self.classifier = tf.keras.layers.Dense(
        1,
        kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
        activation='sigmoid',
        name='classifier',
    )

  def call(self, inputs):
    """Return the sigmoid prediction for a batch.

    Args:
      inputs: a 3-element sequence (input_ids, attention_mask, token_type_ids).

    Returns:
      The output of the Dense(1, sigmoid) head applied to the encoder's second
      output field.
    """
    input_ids, attention_mask, token_type_ids = inputs
    outputs = self.bert(
        input_ids=input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
    )
    # Index 1 is the encoder's pooled output (index 0 is the per-token hidden states).
    pooled_output = outputs[1]
    return self.classifier(pooled_output)

In [108]:
# TPU start-up code: resolve the TPU address again, reconnect, and re-initialize the TPU system.
tpu_grpc_address = 'grpc://' + os.environ['COLAB_TPU_ADDR']
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=tpu_grpc_address)
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)



<tensorflow.python.tpu.topology.Topology at 0x791581c61180>

In [109]:
# Use the stable tf.distribute.TPUStrategy endpoint (as the first TPU section of this
# notebook does) instead of the deprecated tf.distribute.experimental.TPUStrategy alias.
strategy = tf.distribute.TPUStrategy(resolver)



In [110]:
# Build and compile the BERT classifier inside the TPU strategy scope.
with strategy.scope():
  model = TFBertForSequenceClassification('bert-base-multilingual-cased')
  optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
  # Binary cross-entropy pairs with the classifier's single sigmoid output.
  loss = tf.keras.losses.BinaryCrossentropy()
  model.compile(
      loss=loss,
      optimizer=optimizer,
      metrics=['accuracy'],
  )

Downloading pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already

In [111]:
# Train for 2 epochs with batches of 64; validation_split=0.2 sets aside a fraction of
# the training arrays for validation.
model.fit(
    x=train_X,
    y=train_y,
    batch_size=64,
    epochs=2,
    validation_split=0.2,
)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x79158a76dd80>

In [112]:
# Evaluate on the test features; `results` holds the loss followed by the compiled metric.
results = model.evaluate(x=test_X, y=test_y, batch_size=1024)
print('test loss, test acc: ', results)

test loss, test acc:  [0.3197639584541321, 0.8605916500091553]


In [113]:
def sentiment_predict(new_sentence):
  """Print the model's score for one sentence and a positive/negative verdict.

  Reads the notebook-level `tokenizer`, `max_seq_len` and `model`. The raw score
  is printed first; a score above 0.5 is reported as a positive review, anything
  else as a negative review with confidence `1 - score`.

  Args:
    new_sentence: the review text to classify.

  Returns:
    None; the results are printed.
  """
  # padding='max_length' replaces the deprecated pad_to_max_length=True, and
  # truncation=True makes the cut-off at max_seq_len explicit.
  input_id = tokenizer.encode(
      new_sentence, max_length=max_seq_len, padding='max_length', truncation=True)

  # 1 for real tokens, 0 for padding; built per position so it does not depend on
  # which side the tokenizer pads on.
  pad_id = tokenizer.pad_token_id
  attention_mask = [int(token != pad_id) for token in input_id]
  token_type_id = [0] * max_seq_len

  # Wrap each sequence in a batch dimension of size 1.
  input_ids = np.array([input_id])
  attention_masks = np.array([attention_mask])
  token_type_ids = np.array([token_type_id])

  encoded_input = [input_ids, attention_masks, token_type_ids]
  score = model.predict(encoded_input)[0][0]
  print(score)

  if score > 0.5:
    print("{:.2f}% 확률로 긍정 리뷰입니다.\n".format(score*100))
  else:
    print("{:.2f}% 확률로 부정 리뷰입니다.\n".format((1-score)*100))


In [114]:
# Score a sample review with the trained classifier.
sentiment_predict("보던거라 계속 보고 있었는데 전개도 느리고 주인공인 은희는 한두컷 나오면서 소극적인 모습에")



0.0101059675
98.99% 확률로 부정 리뷰입니다.



In [115]:
# Score a longer sample review with the trained classifier.
sentiment_predict('스토리는 확실히 실망이였지만 배우들 연기력이 대박이었다. 특히 이제훈 연기는 정말... 이  배우들로 이렇게밖에 만들지 못한 영화는 아쉽지만 배우들 연기력과 사운드는 정말 빛났던 영화. 기대하고 극장에서 보면 많이 실망했겠지만 평점보고 기대없이 집에서 편하게 보면 괜찮아요. 이제훈님 연기력은 최고인 것 같습니다 ')



0.98332304
98.33% 확률로 긍정 리뷰입니다.



In [117]:
# Score a short sample review with the trained classifier.
sentiment_predict('이 영화 존잼입니다.')

0.59670216
59.67% 확률로 긍정 리뷰입니다.

