In [None]:
from transformers import AutoTokenizer
from transformers import GPT2LMHeadModel

In [None]:
# Load the tokenizer and the LM-head model from the same pretrained checkpoint.
# BOS and EOS are both mapped to '</s>'; '<pad>' is registered as the padding token.
checkpoint = 'skt/kogpt2-base-v2'
tokenizer = AutoTokenizer.from_pretrained(
    checkpoint,
    bos_token = '</s>',
    eos_token = '</s>',
    pad_token = '<pad>',
)
model = GPT2LMHeadModel.from_pretrained(checkpoint)

In [None]:
# Show the ids the tokenizer assigned to the BOS, EOS and PAD tokens.
for special_id in (tokenizer.bos_token_id, tokenizer.eos_token_id, tokenizer.pad_token_id):
  print(special_id)
print('-' * 10)
# Decode vocabulary ids 1 through 4 to see which tokens they correspond to.
for vocab_id in range(1, 5):
  print(tokenizer.decode(vocab_id))

In [None]:
import urllib.request
import pandas as pd

In [None]:
# Download the chatbot CSV into the working directory as 'ChatbotData.csv'.
csv_url = "https://raw.githubusercontent.com/songys/Chatbot_data/master/ChatbotData.csv"
urllib.request.urlretrieve(csv_url, filename = 'ChatbotData.csv')

In [None]:
# Read the downloaded CSV into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer = 'ChatbotData.csv')
data.head(n = 5)

In [None]:
# Number of rows (question/answer pairs) in the loaded table.
print(data.shape[0])

In [None]:
from tqdm import tqdm
import torch
from torch.utils.data import Dataset, DataLoader

In [None]:
# Mini-batch size; the same value is assigned again in the cell that builds the DataLoader.
batch_size = 32

class ChatDataset(Dataset):
  """Dataset that turns each question/answer row into one token-id sequence.

  Args:
    data: table exposing ``Q`` and ``A`` columns that support positional
      ``.iloc`` indexing.
    tokenizer: object providing ``bos_token_id``, ``eos_token_id`` and
      ``encode(text, add_special_tokens=...)``.
  """

  def __init__(self, data, tokenizer):
    self.tokenizer = tokenizer
    self.data = data

  def __len__(self):
    """Return the number of rows in the underlying table."""
    return len(self.data)

  def __getitem__(self, idx):
    """Return ``[BOS] + encode(question + answer) + [EOS]`` as a 1-D long tensor."""
    tok = self.tokenizer
    q_text = self.data.Q.iloc[idx]
    a_text = self.data.A.iloc[idx]

    # NOTE(review): the empty-string pieces around the question look like
    # placeholders for speaker-marker tokens; confirm whether markers were intended.
    body_ids = tok.encode('' + q_text + '' + a_text, add_special_tokens = False)

    token_ids = [tok.bos_token_id] + body_ids + [tok.eos_token_id]
    return torch.tensor(token_ids, dtype = torch.long)

def collate_fn(batch):
  """Pad a list of 1-D token-id tensors to a common length and stack them batch-first.

  The fill value is the pad id of the module-level ``tokenizer``.
  """
  pad_id = tokenizer.pad_token_id
  padded = torch.nn.utils.rnn.pad_sequence(batch, batch_first = True, padding_value = pad_id)
  return padded

In [None]:
batch_size = 32
chat_dataset = ChatDataset(data, tokenizer)
# Shuffle so that every epoch sees the rows in a different order; without it the
# model is always updated on the CSV rows in their fixed file order.
data_loader = DataLoader(chat_dataset, batch_size = batch_size, shuffle = True, collate_fn = collate_fn)

In [None]:
# Adam over all model parameters with a small fine-tuning learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr = 3e-5, eps = 1e-08)

# len(data_loader) is already the number of batches per epoch, so it must not be
# divided by batch_size a second time.
steps = len(data_loader)
print(steps)

In [None]:
# Prefer the GPU when one is available, otherwise fall back to the CPU,
# then move the model's parameters onto the chosen device.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

In [None]:
epochs = 3
pad_token_id = tokenizer.pad_token_id

# from_pretrained returns the model in eval mode; switch to train mode so that
# dropout is active while fine-tuning.
model.train()

for epoch in range(0, epochs):
  # Running sum of the per-batch losses for this epoch.
  epoch_loss = 0

  for batch in tqdm(data_loader, desc = f'Epoch {epoch + 1} / {epochs}'):
    batch = batch.to(device)

    # Padding added by collate_fn must neither be attended to nor contribute to
    # the loss: mask it out of attention and set its label to -100, the index the
    # language-modeling loss ignores.
    is_pad = batch == pad_token_id
    attention_mask = (~is_pad).long()
    labels = batch.clone()
    labels[is_pad] = -100

    optimizer.zero_grad()

    outputs = model(input_ids = batch, attention_mask = attention_mask, labels = labels)
    loss = outputs.loss
    batch_loss = loss.mean()

    batch_loss.backward()
    optimizer.step()

    epoch_loss += batch_loss.item()

  print('[Epoch:{:>4}] cost = {:>.9}'.format(epoch+1, epoch_loss))

# Leave the model in eval mode so later generation runs without dropout.
model.eval()

In [None]:
def return_answer(user_text):
  """Generate the chatbot's reply to ``user_text``.

  The prompt is ``[BOS] + encode(user_text)``, which mirrors how ChatDataset
  starts each training sequence (BOS, then the question, then the answer).

  Args:
    user_text: the user's message as a string.

  Returns:
    The decoded continuation produced by the model, with special tokens
    removed and surrounding whitespace stripped. May be an empty string if the
    model emits EOS immediately.
  """
  prompt_ids = [tokenizer.bos_token_id] + tokenizer.encode(user_text, add_special_tokens = False)
  input_ids = torch.tensor([prompt_ids], dtype = torch.long).to(device)

  # Make sure dropout is disabled while generating.
  model.eval()
  output = model.generate(
      input_ids,
      max_length = 50,
      do_sample = True,
      top_k = 2,
      eos_token_id = tokenizer.eos_token_id,
      pad_token_id = tokenizer.pad_token_id,
  )

  # generate() returns the prompt followed by the new tokens; keep only the new
  # tokens so the reply does not echo the question. Splitting the decoded text
  # on a space and taking index 1 returned a single word and raised IndexError
  # when the text contained no space.
  answer_ids = output[0, input_ids.shape[1]:].tolist()
  chatbot = tokenizer.decode(answer_ids, skip_special_tokens = True).strip()
  return chatbot

In [None]:
# Try the chatbot on a short Korean greeting; the notebook displays the returned reply.
return_answer('안녕!반가워~')