In [None]:
!pip install transformers

In [None]:
# Mount Google Drive at /content/drive so the project directory under it is reachable.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
cd /content/drive/MyDrive/Colab Notebooks/kochat

In [None]:
import tensorflow as tf
from transformers import AutoTokenizer
from transformers import TFGPT2LMHeadModel

In [None]:
# Load the KoGPT2 tokenizer, declaring which vocabulary entries serve as the
# BOS / EOS / PAD special tokens (BOS and EOS share the same '</s>' entry).
tokenizer = AutoTokenizer.from_pretrained(
    'skt/kogpt2-base-v2',
    bos_token='</s>',
    eos_token='</s>',
    pad_token='<pad>',
)

# Load the matching language model as a TensorFlow model; from_pt=True asks
# Transformers to build it from the PyTorch checkpoint.
model = TFGPT2LMHeadModel.from_pretrained(
    'skt/kogpt2-base-v2',
    from_pt=True,
)

In [None]:
# Print the ids assigned to the BOS, EOS and PAD special tokens, in that order.
for special_id in (tokenizer.bos_token_id, tokenizer.eos_token_id, tokenizer.pad_token_id):
    print(special_id)
print('-' * 10)
# Decode vocabulary ids 1 through 4 to see which tokens they map to.
for vocab_id in range(1, 5):
    print(tokenizer.decode(vocab_id))

In [None]:
import pandas as pd
import tqdm
import urllib.request

In [None]:
# Load the fine-tuning pairs from finetune.csv in the current working directory;
# its Q and A columns are read by get_chat_data().
train_data = pd.read_csv('finetune.csv')

In [None]:
# Number of rows loaded from the CSV.
len(train_data)

In [None]:
# Mini-batch size used by padded_batch() and by the steps-per-epoch calculation.
batch_size = 16

In [None]:
def get_chat_data():
  """Yield one list of token ids per (Q, A) row of ``train_data``.

  Every yielded sequence has the form
  ``[bos] + encode('<usr>' + Q + '<sys>' + A) + [eos]``, built with the
  module-level ``tokenizer``.
  """
  prefix = [tokenizer.bos_token_id]
  suffix = [tokenizer.eos_token_id]
  pairs = zip(train_data['Q'].to_list(), train_data['A'].to_list())
  for user_turn, bot_turn in pairs:
    token_ids = tokenizer.encode('<usr>' + user_turn + '<sys>' + bot_turn)
    yield prefix + token_ids + suffix

In [None]:
# Stream the variable-length token-id sequences produced by get_chat_data().
# output_signature replaces the deprecated output_types argument and declares
# each element as a 1-D int32 tensor of unknown length.
dataset = tf.data.Dataset.from_generator(
    get_chat_data,
    output_signature=tf.TensorSpec(shape=(None,), dtype=tf.int32),
)

In [None]:
# Group the sequences into mini-batches of batch_size, filling shorter
# sequences with the tokenizer's pad id.
dataset = dataset.padded_batch(
    batch_size=batch_size,
    padded_shapes=(None,),
    padding_values=tokenizer.pad_token_id,
)

In [None]:
# Print only the first padded batch; `batch` stays bound afterwards so the
# following inspection cells can index into it.
for batch in dataset:
    print(batch)
    break

In [None]:
# Decode the first sequence of the batch back into text.
tokenizer.decode(batch[0])

In [None]:
# Raw token ids of the first sequence in the batch.
print(batch[0])

In [None]:
# Encode a literal string that spells out the special-token markers and print
# the resulting ids.
print(tokenizer.encode('</s><usr> 12시 땡!<sys> 하루가 또 가네요.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'))

In [None]:
# Adam optimizer applied to the model's trainable variables in the fine-tuning loop.
adam = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)

In [None]:
# Batches per epoch, computed with ceiling division: `len // batch_size + 1`
# would count one batch too many whenever the row count is an exact multiple
# of batch_size.
steps = (len(train_data) + batch_size - 1) // batch_size
print(steps)

In [None]:
import tqdm.auto  # notebook-aware progress bar; tqdm.tqdm_notebook is deprecated

EPOCHS = 10

for epoch in range(EPOCHS):
    loss_total = 0.0
    batches_seen = 0

    for batch in tqdm.auto.tqdm(dataset, total=steps):
        # True for real tokens, False where padded_batch inserted the pad id.
        # (EOS uses a different vocabulary entry than PAD, so it is kept.)
        is_real_token = tf.not_equal(batch, tokenizer.pad_token_id)
        attention_mask = tf.cast(is_real_token, tf.int32)
        # -100 is the label value the Transformers LM loss ignores, so padding
        # positions contribute neither to the loss nor to the gradients.
        labels = tf.where(is_real_token, batch, tf.constant(-100, dtype=batch.dtype))

        with tf.GradientTape() as tape:
            # training=True enables dropout, as intended while fine-tuning.
            result = model(batch, attention_mask=attention_mask, labels=labels, training=True)
            # result[0] is the language-modelling loss; reduce it to a scalar.
            batch_loss = tf.reduce_mean(result[0])

        grads = tape.gradient(batch_loss, model.trainable_variables)
        adam.apply_gradients(zip(grads, model.trainable_variables))
        loss_total += float(batch_loss)
        batches_seen += 1

    # Average over the batches actually processed rather than the estimated
    # step count, so the reported cost is a true per-batch mean.
    epoch_loss = loss_total / max(batches_seen, 1)
    print('[Epoch: {:>4}] cost = {:>.9}'.format(epoch + 1, epoch_loss))

In [None]:
# Sample user utterance for a quick generation check.
text = '오늘도 좋은 하루!'

In [None]:
# Wrap the utterance in the '<usr>...<sys>' template that get_chat_data() uses
# for the training sequences, leaving the answer part empty.
sent = '<usr>' + text + '<sys>'

In [None]:
# Prepend BOS to the encoded prompt and wrap it in an outer list so the tensor
# has a leading batch dimension of size 1.
input_ids = tf.convert_to_tensor(
    [[tokenizer.bos_token_id] + tokenizer.encode(sent)]
)

In [None]:
# Default (non-sampling, single-beam) decoding, capped at 20 tokens in total
# (max_length includes the prompt), ending at EOS.
# early_stopping is omitted: it only applies to beam search, which is not used here.
output = model.generate(input_ids, max_length=20, eos_token_id=tokenizer.eos_token_id)

In [None]:
# Convert the ids of the first returned sequence back into text.
decoded_sentence = tokenizer.decode(output[0].numpy().tolist())

In [None]:
# Keep only the reply: the text between the first '<sys>' marker and any later
# one, with the EOS marker removed. partition() never raises, whereas
# split('<sys> ')[1] fails with IndexError when no space follows '<sys>'
# (for example when nothing was generated after the prompt).
decoded_sentence.partition('<sys>')[2].partition('<sys>')[0].replace('</s>', '').strip()

In [None]:
# Draw three top-k samples (k=20) for the same prompt and decode all of them,
# so every requested sample is visible in the cell output.
output = model.generate(input_ids, max_length=20, do_sample=True, top_k=20, num_return_sequences=3)
[tokenizer.decode(sequence.numpy().tolist()) for sequence in output]

In [None]:
def return_answer_by_chatbot(user_text):
  """Generate two sampled chatbot replies for ``user_text``.

  The prompt is ``bos + encode('<usr>' + user_text + '<sys>')``. Two
  continuations are sampled with ``model.generate`` and, for each, the text
  following the ``<sys>`` marker is returned with the ``</s>`` and ``<pad>``
  markers removed and surrounding whitespace stripped.

  Args:
    user_text: The user's utterance as a string.

  Returns:
    A ``(chatbot_response, chatbot_response_1)`` tuple of reply strings. A
    reply is ``''`` when nothing was generated after the prompt.
  """
  prompt = '<usr>' + user_text + '<sys>'
  input_ids = tf.convert_to_tensor([[tokenizer.bos_token_id] + tokenizer.encode(prompt)])
  # NOTE: max_length counts the prompt tokens as well, so long inputs leave
  # little or no room for a reply.
  output = model.generate(input_ids, max_length=20, do_sample=True, repetition_penalty=1.2, num_return_sequences=2)

  def extract_reply(token_ids):
    """Decode one generated sequence and return only the reply text."""
    decoded = tokenizer.decode(token_ids.numpy().tolist())
    # Text between the first '<sys>' and any later one. partition() never
    # raises, unlike split('<sys> ')[1], which fails with IndexError when no
    # space follows the marker (e.g. an empty generation).
    reply = decoded.partition('<sys>')[2].partition('<sys>')[0]
    return reply.replace('</s>', '').replace('<pad>', '').strip()

  return (extract_reply(output[0]), extract_reply(output[1]))

In [None]:
# Try the chatbot on a handful of utterances, printing both sampled replies.
test_list = [
    '나랑 영화 보자',
    '밥 같이 먹을래?',
    '좀 이따가 밥먹을래?',
    '너 언제와?',
    '내일 시간 괜찮을까요?',
    '커피 한 잔 할까?',
]

for text in test_list:
    out1, out2 = return_answer_by_chatbot(text)
    print("Input: ", text)
    print("Output 1: ", out1)
    print("Output 2: ", out2)
    print()