<a href="https://colab.research.google.com/github/SeohyunLyoo/Study/blob/main/Chatbot_KoGPT2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import tensorflow as tf

from transformers import AutoTokenizer, TFGPT2LMHeadModel



# **1. AutoTokenizer & TFGPT2LMHeadModel**
*   AutoTokenizer.from_pretrained : 해당 모델이 학습되었을 시점의 Tokenizer 반환
*   TFGP2LMHeadModel.from_pretrained : 두 개의 문장을 전달, 서로 연관이 있는 문장 관계 여부 판단



In [4]:
tokenizer = AutoTokenizer.from_pretrained("skt/kogpt2-base-v2", bos_token='</s>', eos_token='</s>', pad_token='<pad>')
model     = TFGPT2LMHeadModel.from_pretrained("skt/kogpt2-base-v2", from_pt=True)

config.json:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.83M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/513M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFGPT2LMHeadModel: ['transformer.h.3.attn.masked_bias', 'transformer.h.2.attn.masked_bias', 'transformer.h.4.attn.masked_bias', 'transformer.h.5.attn.masked_bias', 'transformer.h.8.attn.masked_bias', 'lm_head.weight', 'transformer.h.7.attn.masked_bias', 'transformer.h.0.attn.masked_bias', 'transformer.h.10.attn.masked_bias', 'transformer.h.6.attn.masked_bias', 'transformer.h.9.attn.masked_bias', 'transformer.h.1.attn.masked_bias', 'transformer.h.11.attn.masked_bias']
- This IS expected if you are initializing TFGPT2LMHeadModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFGPT2LMHeadModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassifica



*  KoGPT-2에 정의된 각 Token들의 정수 값



In [6]:
print(tokenizer.bos_token_id)
print(tokenizer.eos_token_id)
print(tokenizer.pad_token_id)
print(tokenizer.unk_token_id)
print(tokenizer.mask_token_id)

1
1
3
51200
None


# **2. Chatbot 데이터 Load**

In [7]:
import pandas as pd
import tqdm
import urllib.request

In [8]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/songys/Chatbot_data/master/ChatbotData.csv", filename="ChatBotData.csv")
train_data = pd.read_csv('ChatBotData.csv')
print('챗봇 데이터의 개수 :', len(train_data))

챗봇 데이터의 개수 : 11823


# **3. Data 전처리**

In [20]:
def get_chat_data():
    for question, answer in zip(train_data['Q'].to_list(), train_data['A'].to_list()):
        bos_token = [tokenizer.bos_token_id]
        eos_token = [tokenizer.eos_token_id]
        sentence = tokenizer.encode('<usr>' + question + '<sys>' + answer)
        yield bos_token + sentence + eos_token

In [21]:
BATCH_SIZE  = 32

dataset = tf.data.Dataset.from_generator(get_chat_data, output_types=tf.int32)

# BATCH_SIZE인 32개씩 데이터를 묶으면서 Padding Token으로 tokenizer.pad_token_id 사용
dataset = dataset.padded_batch(batch_size=BATCH_SIZE, padded_shapes=(None,), padding_values=tokenizer.pad_token_id)

In [22]:
for batch in dataset:
    print(batch)
    break

tf.Tensor(
[[    1     2  9349  7888   739  7318   376     4 12557  6824  9108  9028
   7098 25856     1     3     3     3     3     3     3     3     3     3
      3     3     3     3     3     3]
 [    1     2  9020  8263  7497 10192 11615  8210  8006     4 12422  8711
   9535  7483 12521     1     3     3     3     3     3     3     3     3
      3     3     3     3     3     3]
 [    1     2  9085  7597   395  8149 10624  7397 24224 13358  7182     4
  12079  8135 16899  9677  8234   389     1     3     3     3     3     3
      3     3     3     3     3     3]
 [    1     2  9085  7597   395  8149  9465 10624  7397 24224 13358  7182
      4 12079  8135 16899  9677  8234   389     1     3     3     3     3
      3     3     3     3     3     3]
 [    1     2  9943   422   418  9327  8702  7098     4  9847 16912 18328
   8671  7415  8263  8234   389     1     3     3     3     3     3     3
      3     3     3     3     3     3]
 [    1     2  9815   410 21249 10174  6824  8210  800

In [24]:
batch[0], tokenizer.decode(batch[0])

(<tf.Tensor: shape=(30,), dtype=int32, numpy=
 array([    1,     2,  9349,  7888,   739,  7318,   376,     4, 12557,
         6824,  9108,  9028,  7098, 25856,     1,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3], dtype=int32)>,
 '</s><usr> 12시 땡!<sys> 하루가 또 가네요.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>')

# **4. Chatbot 학습**

In [None]:
adam = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)

# 하나의 epoch에서 실행되는 학습 횟수 설정
steps = len(train_data) // BATCH_SIZE + 1

EPOCHS = 3

for epoch in range(EPOCHS):
    epoch_loss = 0

    for batch in tqdm.tqdm(dataset, total=steps):
        with tf.GradientTape() as tape:
            result  = model(batch, labels=batch)
            loss    = result[0]
            batch_loss = tf.reduce_mean(loss)

        grads = tape.gradient(batch_loss, model.trainable_variables)
        adam.apply_gradients(zip(grads, model.trainable_variables))
        epoch_loss += batch_loss

    print('EPOCH: {}, LOSS: {}'.format(epoch, epoch_loss))


# **5. Chatbot 실행**