chatbot_final_model.pth  
chatbot_model.pth  
chatbot_model2.pth

In [2]:
import math
import numpy as np
import pandas as pd
import random
import re
import torch
import urllib.request
from torch.utils.data import DataLoader, Dataset
from transformers import PreTrainedTokenizerFast
import urllib.request
from torch.optim.lr_scheduler import ReduceLROnPlateau
from transformers import GPT2LMHeadModel, GPT2Config, GPT2Tokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the raw question/answer corpus and print a summary of its columns.
rawal_data = pd.read_csv(filepath_or_buffer='./data/rawal_data.csv')
rawal_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20938 entries, 0 to 20937
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   field     20938 non-null  object
 1   question  20938 non-null  object
 2   answer    20938 non-null  object
dtypes: object(3)
memory usage: 490.9+ KB


In [4]:
# Training split: a random 80% of the rows, drawn with a fixed seed so the
# split is reproducible.
train_data = rawal_data.sample(frac=0.8, random_state=2021)
# Test split: every row whose index label was not drawn for training (20%).
test_data = rawal_data.loc[~rawal_data.index.isin(train_data.index)]

In [5]:
# Character-length extremes of the question column (len() of each string,
# i.e. characters, not tokens).
question_lengths = rawal_data['question'].map(len)
max_length, min_length = question_lengths.max(), question_lengths.min()
print("The maximum length of the question is:", max_length)
print("The minimum length of the question is:", min_length)

# Same statistics for the answer column.
answer_lengths = rawal_data['answer'].map(len)
max_length_answer, min_length_answer = answer_lengths.max(), answer_lengths.min()
print("The maximum length of the answer is:", max_length_answer)
print("The minimum length of the answer is:", min_length_answer)

The maximum length of the question is: 5134
The minimum length of the question is: 5
The maximum length of the answer is: 3935
The minimum length of the answer is: 1


In [6]:
# Collect rows whose answer is exactly the empty string and report whether
# any exist. Only '' is matched; whitespace-only answers are not.
empty_rows = rawal_data.loc[rawal_data['answer'].eq('')]
print(
    "There are rows with an empty answer."
    if not empty_rows.empty
    else "There are no rows with an empty answer."
)

There are no rows with an empty answer.


In [7]:
# Special-token strings used to frame one question/answer exchange.
Q_TKN, A_TKN = "<usr>", "<sys>"
# Beginning- and end-of-sequence markers share the same string here.
BOS = EOS = "</s>"
SENT = '<unused1>'
PAD, MASK = "<pad>", "<unused0>"

# Pretrained tokenizer, configured with the special tokens defined above.
TOKENIZER = PreTrainedTokenizerFast.from_pretrained(
    "skt/kogpt2-base-v2",
    bos_token=BOS,
    eos_token=EOS,
    unk_token="<unk>",
    pad_token=PAD,
    mask_token=MASK,
)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [12]:
class ChatbotDataset(Dataset):
    """Question/answer pairs encoded as token-id tensors.

    Item ``idx`` yields three 1-D ``torch.long`` tensors:

    * ``q_input``  -- ids of ``Q_TKN + question + SENT + A_TKN``
    * ``a_input``  -- ids of ``BOS + answer``
    * ``a_output`` -- ids of ``answer + EOS`` (``a_input`` shifted left by one)

    Sequences are neither truncated nor padded here; padding is left to the
    DataLoader's ``collate_fn``.
    """

    def __init__(self, DF):
        """Store the text columns and pre-encode the special tokens.

        Args:
            DF: DataFrame with ``question`` and ``answer`` columns of strings.
        """
        self.data = DF
        self.questions = DF['question'].values
        self.answers = DF['answer'].values
        self.tokenizer = TOKENIZER
        self.q_token = Q_TKN
        self.a_token = A_TKN
        self.sent_token = SENT
        self.bos_token = BOS
        self.eos_token = EOS
        self.mask_token = MASK

        # The special tokens are constant, so encode them once here instead
        # of re-tokenizing all of them on every __getitem__ call.
        self._q_ids = self._encode(self.q_token)
        self._a_ids = self._encode(self.a_token)
        self._sent_ids = self._encode(self.sent_token)
        self._bos_ids = self._encode(self.bos_token)
        self._eos_ids = self._encode(self.eos_token)

    def _encode(self, text):
        """Tokenize ``text`` and return the corresponding list of token ids."""
        tokens = self.tokenizer.tokenize(text)
        return list(self.tokenizer.convert_tokens_to_ids(tokens))

    def __len__(self):
        """Number of question/answer pairs."""
        return len(self.questions)

    def __getitem__(self, idx):
        """Encode pair ``idx`` and return ``(q_input, a_input, a_output)``."""
        question_ids = self._encode(self.questions[idx])
        answer_ids = self._encode(self.answers[idx])

        q_input = self._q_ids + question_ids + self._sent_ids + self._a_ids
        # a_input and a_output hold the same answer ids offset by one position.
        a_input = self._bos_ids + answer_ids
        a_output = answer_ids + self._eos_ids

        return (
            torch.tensor(q_input, dtype=torch.long),
            torch.tensor(a_input, dtype=torch.long),
            torch.tensor(a_output, dtype=torch.long),
        )

In [13]:
# Pad every sequence in a batch to the length of the longest one.
def collate_fn(batch):
    """Collate ``(q_input, a_input, a_output)`` samples into padded batches.

    Each of the three fields is padded independently, batch-first, up to the
    longest sequence of that field in the batch. The fill value is
    ``TOKENIZER.pad_token_id`` (not zero).

    Returns:
        Tuple ``(q_input, a_input, a_output)`` of 2-D tensors.
    """
    questions, answer_inputs, answer_targets = zip(*batch)

    def _pad(sequences):
        return torch.nn.utils.rnn.pad_sequence(
            sequences,
            batch_first=True,
            padding_value=TOKENIZER.pad_token_id,
        )

    return _pad(questions), _pad(answer_inputs), _pad(answer_targets)

In [14]:
# Training dataset and loader: batches of 32, reshuffled on every pass and
# padded per batch by collate_fn.
train_dataset = ChatbotDataset(train_data)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn,
)
# Test dataset and loader: same batching, but kept in a fixed order.
test_dataset = ChatbotDataset(test_data)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=32,
    shuffle=False,
    collate_fn=collate_fn,
)

In [15]:
# Check batch shapes: print the padded tensor sizes of the first batch from
# the training loader, then from the test loader.
for loader in (train_loader, test_loader):
    for q_input, a_input, a_output in loader:
        print(q_input.size(), a_input.size(), a_output.size())
        break

torch.Size([32, 1843]) torch.Size([32, 695]) torch.Size([32, 695])
torch.Size([32, 144]) torch.Size([32, 802]) torch.Size([32, 802])


In [16]:
# Build the model and everything the training loop needs.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MODEL = GPT2LMHeadModel.from_pretrained("skt/kogpt2-base-v2").to(DEVICE)

# Targets equal to the pad id are excluded from the loss.
CRITERION = torch.nn.CrossEntropyLoss(ignore_index=TOKENIZER.pad_token_id)

EPOCHS = 100
# NOTE(review): lr=1e-3 is large for fine-tuning a pretrained transformer --
# confirm this value is intended.
OPTIMIZER = torch.optim.Adam(params=MODEL.parameters(), lr=1e-3)
# Lowers the learning rate when the monitored value (stepped with the test
# loss in the training loop) stops decreasing; patience is 5 epochs.
SCHEDULER = ReduceLROnPlateau(OPTIMIZER, mode='min', patience=5)

In [17]:
def training(model, iterator, optimizer, criterion, device, pad_token_id=None):
    """Run one training epoch with teacher forcing and return the mean batch loss.

    Every batch from ``iterator`` is a ``(q_input, a_input, a_output)`` triple
    of 2-D id tensors. ``a_output`` is ``a_input`` shifted left by one token,
    so the targets are already aligned with the answer positions. The model is
    fed ``q_input`` concatenated with ``a_input``; the logits over the answer
    span are scored against ``a_output`` with ``criterion``.

    The model's built-in ``labels=`` loss is deliberately not used: it shifts
    the labels internally and needs them to have the same shape as
    ``input_ids``, which does not hold for ``q_input`` / ``a_output``.

    Args:
        model: Causal LM whose forward accepts ``input_ids``,
            ``attention_mask``, ``position_ids`` and ``return_dict``, and
            whose output exposes ``.logits`` of shape (batch, seq, vocab).
        iterator: Sized iterable of batches (e.g. a DataLoader).
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Loss taking ``(N, vocab)`` logits and ``(N,)`` targets.
        device: Device the batch tensors are moved to.
        pad_token_id: Id used for padding. Defaults to
            ``criterion.ignore_index`` when the criterion has one; if neither
            is available every position is treated as a real token.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    if pad_token_id is None:
        pad_token_id = getattr(criterion, "ignore_index", None)

    model.train()
    epoch_loss = 0.0
    for q_input, a_input, a_output in iterator:
        q_input = q_input.to(device)
        a_input = a_input.to(device)
        a_output = a_output.to(device)

        # NOTE(review): the concatenated length must fit the model's maximum
        # number of positions -- confirm long samples are truncated upstream.
        input_ids = torch.cat([q_input, a_input], dim=1)

        # Mask out padding so it is never attended to. This includes the
        # padding that sits between a short question and its answer.
        if pad_token_id is None:
            attention_mask = torch.ones_like(input_ids)
        else:
            attention_mask = (input_ids != pad_token_id).long()
        # Number only the real tokens, so the answer's positions continue
        # directly after the question's regardless of padding in between.
        position_ids = (attention_mask.cumsum(dim=1) - 1).clamp(min=0)

        optimizer.zero_grad()
        output = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            return_dict=True,
        )
        # The logit at answer position t predicts a_output[:, t].
        answer_logits = output.logits[:, q_input.size(1):, :]
        loss = criterion(
            answer_logits.reshape(-1, answer_logits.size(-1)),
            a_output.reshape(-1),
        )
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(iterator)

In [18]:
def testing(model, iterator, criterion, device):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for i, (q_input, a_input, a_output) in enumerate(iterator):
            q_input = q_input.to(device)
            a_input = a_input.to(device)
            a_output = a_output.to(device)
            output = model(input_ids=q_input, labels=a_output, return_dict=True)
            loss = output.loss
            epoch_loss += loss.item()
    return epoch_loss / len(iterator)

In [19]:
# Per-epoch loss histories: one float is appended per completed epoch.
train_loss = []
test_loss = []
for epoch in range(EPOCHS):
    # Keep the per-epoch scalars under their own names. Assigning them to
    # train_loss / test_loss would replace the history lists with floats and
    # make the .append() calls below fail.
    epoch_train_loss = training(MODEL, train_loader, OPTIMIZER, CRITERION, DEVICE)
    epoch_test_loss = testing(MODEL, test_loader, CRITERION, DEVICE)
    # The plateau scheduler monitors the held-out loss.
    SCHEDULER.step(epoch_test_loss)
    train_loss.append(epoch_train_loss)
    test_loss.append(epoch_test_loss)
    print(f"Epoch: {epoch+1:02}")
    print(f"\tTrain Loss: {epoch_train_loss:.3f}")
    print(f"\tTest Loss: {epoch_test_loss:.3f}")

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
