In [48]:
import math
import random
import numpy as np
import pandas as pd
import torch
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.core.lightning import LightningModule
from torch.utils.data import DataLoader, Dataset
from transformers.optimization import AdamW, get_cosine_schedule_with_warmup
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel
import re
from tqdm import tqdm

In [49]:
user_token = "<user>"
bot_token = "<bot>"
MASK = '<mask>'
PAD = '<pad>'

In [50]:
tokenizer = PreTrainedTokenizerFast.from_pretrained("skt/kogpt2-base-v2",
            bos_token=bot_token, eos_token=user_token, unk_token='<unk>',
            pad_token=PAD, mask_token=MASK)
model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [51]:
tokenizer.model_max_length = 200

In [5]:
data = pd.read_csv('./data/multi_turn_tokens.csv') # 적정길이 200

In [7]:
for i in tqdm(range(len(data))):
    if tokenizer.model_max_length < len(tokenizer.encode(data['conversation'][i])):
        data['conversation'][i] = np.NaN
    else:
        pass

  0%|          | 0/341630 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (203 > 200). Running this sequence through the model will result in indexing errors
100%|██████████| 341630/341630 [03:33<00:00, 1602.19it/s]


In [8]:
data.isna().sum()

conversation    29961
dtype: int64

In [9]:
data.dropna(inplace=True)
data.reset_index(inplace=True, drop=True)

In [10]:
# 챗봇 데이터를 처리하는 클래스를 만든다.
class ChatbotDataset(Dataset):
    def __init__(self, chats, tokenizer):  # 데이터셋의 전처리를 해주는 부분
        self._data = chats
        self.tokenizer = tokenizer

    def __len__(self):  # chatbotdata 의 길이를 리턴한다.
        return len(self._data)

    def __getitem__(self, idx):  # 로드한 챗봇 데이터를 차례차례 DataLoader로 넘겨주는 메서드
        token = self._data['conversation'][idx]
        turn = tokenizer.tokenize(token)
        turn = tokenizer.convert_tokens_to_ids(turn)
        
        token_ids = self.tokenizer.encode(token, padding='max_length')
        
        mask = []
        labels = []
        for i in turn:
            if i == tokenizer.eos_token_id: loop = True
            if i == tokenizer.sep_token_id: loop = False
            if loop == True: 
                mask += [0]
            if loop == False:
                mask += [1]
                
        for i,v in enumerate(turn):
            if i == 0 and v == tokenizer.eos_token_id:
                labels.append(tokenizer.mask_token)
                continue
            if v == tokenizer.eos_token_id: loop = True
            if v == tokenizer.sep_token_id: loop = False
            if loop == False:
                labels.append(tokenizer.convert_ids_to_tokens(v))
            if loop == True and v == tokenizer.eos_token_id:
                labels.append(tokenizer.eos_token)
            elif loop == True:
                labels.append(tokenizer.mask_token)
        
        labels_tokens = ""
        for i in labels: labels_tokens += i
        loop = True
        while loop:
            if len(mask) < tokenizer.model_max_length:
                mask.append(tokenizer.pad_token_id)
            else: loop = False
                
#         mask = torch.LongTensor(mask)
#         mask = mask.view(1, tokenizer.model_max_length)
        
        labels_ids = self.tokenizer.encode(labels_tokens, padding='max_length')
        return token_ids, mask, labels_ids

def collate_batch(batch):
    data =  [item[0] for item in batch]
    mask =  [item[1] for item in batch]
    label = [item[2] for item in batch]
    return torch.LongTensor(data), torch.LongTensor(mask), torch.LongTensor(label)

In [11]:
train_set = ChatbotDataset(data, tokenizer)
#윈도우 환경에서 num_workers 는 무조건 0으로 지정, 리눅스에서는 2
train_dataloader = DataLoader(train_set, batch_size=8, num_workers=2, shuffle=True, collate_fn=collate_batch,)

In [12]:
print("start")
for batch_idx, samples in enumerate(train_dataloader):
    if batch_idx > 3:
        break
    token_ids, mask, label = samples
    print("token_ids ====> ", token_ids)
    print("mask =====> ", mask)
    print("label =====> ", label)
print("end")

start
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
token_ids ====>  tensor([[51200,  9893,  7991,  ...,     3,     3,     3],
        [51200, 18519,  9285,  ...,     3,     3,     3],
        [51200, 10489,  9190,  ...,     3,     3,     3],
        ...,
        [51200,  9664,  7293,  ...,     3,     3,     3],
        [51200,  9114,  7979,  ...,     3,     3,     3],
        [51200,  9769,  9497,  ...,     3,     3,     3]])
mask =====>  tensor([[0, 0, 0,  ..., 3, 3, 3],
  

In [90]:
learning_rate = 3e-5
criterion = torch.nn.CrossEntropyLoss(reduction="none")
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

epochs = 1
Sneg = -1e18

In [82]:
model.set_input_embeddings(torch.nn.Embedding(51201, 768))
model.set_output_embeddings(torch.nn.Linear(768, 51201, bias=False))

In [93]:
model.to(device)
model.train()

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [75]:
import time
print ("start")
s_time = time.time()
for epoch in range(epochs):
    print(epoch)
    for batch_idx, samples in enumerate(train_dataloader):
        optimizer.zero_grad()
        token_ids, mask, label = samples
        token_ids = token_ids.to(device)
        mask = mask.to(device)
        label = label.to(device)
        out = model(token_ids)
        out = out.logits      #Returns a new tensor with the logit of the elements of input
        mask_3d = mask.unsqueeze(dim=2).repeat_interleave(repeats=out.shape[2], dim=2)
        mask_out = torch.where(mask_3d == 1, out, Sneg * torch.ones_like(out))
        loss = criterion(mask_out.transpose(2, 1), label)
        # 평균 loss 만들기 avg_loss[0] / avg_loss[1] <- loss 정규화
        avg_loss = loss.sum() / mask.sum()
        avg_loss.backward()
        # 학습 끝
        optimizer.step()
    e_time = time.time()
    print(f'{epoch}번째 학습까지 걸린시간 {e_time-s_time}')
print ("end")

start
0
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [None]:
tokenizer.all_special_tokens_extended