In [1]:
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import os
import re
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
class cfg:
    user_token = "<usr>"
    bot_token = "<sys>"
    bos_token = '<s>'
    eos_token = '</s>'
    mask_token = '<mask>'
    pad_token = '<pad>'
    unk_token = '<unk>'
    max_len = 128
    max_turns = 6
    epochs = 10
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    learning_rate = 1e-4
    model_name = "skt/kogpt2-base-v2"

tokenizer = PreTrainedTokenizerFast.from_pretrained(cfg.model_name,
            bos_token=cfg.bos_token, eos_token=cfg.eos_token, unk_token=cfg.unk_token,
            pad_token=cfg.pad_token, mask_token=cfg.mask_token, model_max_length = cfg.max_len)

model = GPT2LMHeadModel.from_pretrained(cfg.model_name)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [3]:
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, cfg):
        self.data = data
        self.tokenizer = tokenizer
        self.user_token = cfg.user_token
        self.bot_token = cfg.bot_token
        self.bos_token = cfg.bos_token
        self.eos_token = cfg.eos_token
        self.mask_token = cfg.mask_token
        self.pad_token = cfg.pad_token
        self.unk_token = cfg.unk_token
        self.max_len = cfg.max_len
        self.max_turns = cfg.max_turns
        
    def __len__(self):  # chatbotdata 의 길이를 리턴한다.
        return len(self.data)
        
    def __getitem__(self, idx):
        _data = self.data['conversation'][idx]
        masking_data = self.tokenizer.encode(_data)
        token_ids = self.tokenizer.encode(self.bos_token+_data+self.eos_token, padding='max_length')
        label = token_ids
        token_type_ids = ""
        attention_mask = []
        for mask in masking_data:
            mask = tokenizer.decode(mask)
            if mask == self.user_token: loop=True
            if mask == self.bot_token: loop=False
            if loop==True:
                token_type_ids += self.user_token
                attention_mask += [1]
            if loop==False:
                token_type_ids += self.bot_token
                attention_mask += [1]
        
        token_type_ids = self.tokenizer.encode(token_type_ids, padding='max_length')
        
        # padding
        while True:
            if len(attention_mask) >= self.max_len:
                break
            attention_mask += self.tokenizer.encode(self.pad_token)
        
        return token_ids, token_type_ids, attention_mask, label
    
def collate_batch(batch):
    token_ids = [item[0] for item in batch]
    token_type_ids = [item[1] for item in batch]
    attetion_mask = [item[2] for item in batch]
    label = [item[3] for item in batch]
    return torch.LongTensor(token_ids), torch.LongTensor(token_type_ids), torch.LongTensor(attetion_mask), torch.LongTensor(label)    

In [4]:
data = pd.read_csv('./data/Multi_turn_len128.csv')

In [5]:
div_len = int(len(data)*0.95)
train_data = data[:div_len]
validation_data = data[div_len:]

validation_data.reset_index(drop=True, inplace=True)

In [6]:
train_set = CustomDataset(train_data, tokenizer, cfg)
validation_set = CustomDataset(validation_data, tokenizer, cfg)

#윈도우 환경에서 num_workers 는 무조건 0으로 지정, 리눅스에서는 2
train_dataloader = DataLoader(train_set, batch_size=8, num_workers=2, shuffle=True, collate_fn=collate_batch)
validation_dataloader = DataLoader(validation_set, batch_size=8, num_workers=2, shuffle=True, collate_fn=collate_batch)

In [7]:
for batch_idx, samples in enumerate(train_dataloader):
    if batch_idx > 1:
        break
    token_ids, token_type_ids, attetion_mask, label = samples
    print("token_ids ====> ", token_ids)
    print("token_type_ids =====> ", token_type_ids)
    print("attetion_mask =====> ", attetion_mask)    
    print("label =====> ", label)

token_ids ====>  tensor([[    0,     2, 10156,  ...,     3,     3,     3],
        [    0,     2,  9893,  ...,     3,     3,     3],
        [    0,     2, 10215,  ...,     3,     3,     3],
        ...,
        [    0,     2,   739,  ...,     3,     3,     3],
        [    0,     2, 18466,  ...,     3,     3,     3],
        [    0,     2,  9893,  ...,     3,     3,     3]])
token_type_ids =====>  tensor([[2, 2, 2,  ..., 3, 3, 3],
        [2, 2, 2,  ..., 3, 3, 3],
        [2, 2, 2,  ..., 3, 3, 3],
        ...,
        [2, 2, 2,  ..., 3, 3, 3],
        [2, 2, 2,  ..., 3, 3, 3],
        [2, 2, 2,  ..., 3, 3, 3]])
attetion_mask =====>  tensor([[1, 1, 1,  ..., 3, 3, 3],
        [1, 1, 1,  ..., 3, 3, 3],
        [1, 1, 1,  ..., 3, 3, 3],
        ...,
        [1, 1, 1,  ..., 3, 3, 3],
        [1, 1, 1,  ..., 3, 3, 3],
        [1, 1, 1,  ..., 3, 3, 3]])
label =====>  tensor([[    0,     2, 10156,  ...,     3,     3,     3],
        [    0,     2,  9893,  ...,     3,     3,     3],
        [ 

In [8]:
class ChatBot:
    """
        __init__ : 챗봇 모델 생성
            Args : model, tokenizer, cfg
        
        train : 모델 학습 진행
            Args : epochs, train_dataloader, (validation_dataloader), (save)
        
        load_model : 모델 불러오기
            Args : PATH
        
        save_model : 모델 저장하기
            Args : PATH
        
        talk : 챗봇 대화하기
            대화 종료 멘트 : quit
    """
    
    def __init__(self, model, tokenizer, cfg):
        """
            Args : model, tokenizer, cfg
        """
        self.model = model
        self.tokenizer = tokenizer
        self.device = cfg.device
        self.name = cfg.model_name
        self.optimizer = torch.optim.AdamW(model.parameters(), lr=cfg.learning_rate)
        self.scheduler = torch.optim.lr_scheduler.ExponentialLR(self.optimizer, gamma=0.9)
        
        self.user_token_id = tokenizer.get_vocab()[cfg.user_token]
        self.bot_token_id = tokenizer.get_vocab()[cfg.bot_token]
        self.max_len = cfg.max_len
        self.max_turns = cfg.max_turns
        
        self.losses = []
        self.val_losses = []
    
    def train(self, epochs, train_dataloader, validation_dataloader=None, save=None):
        """
            epochs, train_dataloader, validation_dataloader=None, save=None
            save : epoch마다 모델을 저장할 경로/파일명
        """
        self.model.to(self.device)
        for epoch in range(epochs):
            self.model.train()
            print(f"\n Epoch {epoch+1}/{epochs}", sep="\n")
            start_time = time.time()
            batch_loss = []

            for i, batch in enumerate(train_dataloader):
                input_ids, token_type_ids, attention_mask, labels = batch        
                input_ids, token_type_ids, attention_mask, labels = input_ids.to(self.device), token_type_ids.to(self.device), \
                                                                    attention_mask.to(self.device) ,labels.to(self.device)
                outputs = self.model(
                    input_ids = input_ids,
                    token_type_ids = token_type_ids,
                    attention_mask = attention_mask,
                    labels = labels
                )
                
                loss = outputs.loss
                
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

                batch_loss.append(loss.item())
                
                print(self.status(i+1, len(train_dataloader), time.time()-start_time, np.mean(batch_loss)), end='\r')
            
            self.scheduler.step()
            
            self.losses.append(np.mean(batch_loss))
            
            if validation_dataloader:
                val_loss = self.validation(validation_dataloader)
                print(self.status(i+1, len(train_dataloader), time.time()-start_time, np.mean(batch_loss)) + \
                      " | val_loss : %.6f"%(val_loss), end='\r')
                self.val_losses.append(val_loss)
            
            if save:
                PATH = f'{save}_epochs-{epoch+1}_loss-{np.mean(batch_loss)}.pth'
                torch.save(self.model.state_dict(), PATH)
                
    def validation(self, validation_dataloader):
        self.model.eval()
        batch_loss = []
        
        with torch.no_grad():
            for i, batch in enumerate(validation_dataloader):
                input_ids, token_type_ids, attention_mask, labels = batch
                input_ids, token_type_ids, attention_mask, labels = input_ids.to(self.device), token_type_ids.to(self.device), \
                                                                    attention_mask(self.device) ,labels.to(self.device)
                
                outputs = self.model(
                    input_ids = input_ids,
                    token_type_ids = token_type_ids,
                    attention_mask = attention_mask,
                    labels = labels
                )
                
                loss = outputs.loss
                batch_loss.append(loss.item())
            
            valid_loss = np.mean(batch_loss)
        
        return valid_loss

    @staticmethod
    def status(step, step_len, time, loss):
        return "step : %d/%d - %ds | loss : %.6f | %.2fit/s"%(
            step,
            step_len,
            int(time),
            loss,
            step/time
        )
    
    def load_model(self, PATH):
        """
            PATH : pth 파일이 저장된 경로
        """
        self.model.load_state_dict(torch.load(PATH))
        print("model loaded.")
    
    def save_model(self, PATH=None):
        """
            PATH : 저장할 파일 경로/이름, 생략시 모델 이름과 현재 시간을 파일명으로 지정함
        """
        if not PATH:
            name = self.name.replace("/", "-")
            PATH = f"./{name}_{time.strftime('%Y-%m-%d %H:%M:%S')}.pth"
        torch.save(self.model.state_dict(), PATH)
        print("model saved.")

In [9]:
training_set = ChatBot(model, tokenizer, cfg)

In [11]:
training_set.train(cfg.epochs, train_dataloader, validation_dataloader, save='./check_point/')