# 1. 준비과정

## 1.1 패키지 임포트

In [34]:
from random import randint
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook, trange

from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
import sys

## 1.2 깃허브 클론 

In [2]:
# Clone the KoBERT repository quietly via an IPython shell escape.
# The recorded output shows git refusing to clone because ./KoBERT already
# existed, so this cell is effectively a no-op on re-runs.
# NOTE(review): the URL carries a (masked) credential prefix; avoid committing
# real tokens in notebook cells.
!git clone -q https://****@github.com/SKTBrain/KoBERT.git

fatal: destination path 'KoBERT' already exists and is not an empty directory.


In [3]:
# Load the pretrained KoBERT PyTorch model together with its vocabulary.
# The recorded output ("using cached model") indicates both came from a local cache.
bertmodel, vocab = get_pytorch_kobert_model()

using cached model
using cached model


In [4]:
# Fetch the KoBERT tokenizer resource and wrap it in a gluonnlp tokenizer
# bound to the KoBERT vocabulary; lower=False keeps the input casing as-is.
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model


In [5]:
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU
# so the notebook still runs (slowly) on machines without a GPU instead of
# failing at the first `.to(device)` call.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# 2. 데이터셋 구축

## 2.1 데이터 형태 포맷팅
- 이어지는 경우 : [본문장, 다음문장, 1]
- 안 이어지는 경우 : [본문장, 랜덤문장, 0]

In [6]:
from util.data_setter import make_trainset as ready

In [7]:
# Path of the cleaned sentence file and the training samples built from it.
# Per the notebook's format notes, each sample is
# [sentence, next-or-random sentence, label] with label 1 when the second
# sentence follows the first and 0 when it is a random one.
datapath = './data/정제/sentence_data.txt'
datalist = ready(datapath)

In [8]:
# Inspect one formatted sample to sanity-check the [sentence, sentence, label] layout.
datalist[31]

[' 또한 대모가 되어달라는 요청도 업었고 말이에요.', ' 그러다가 식물 키우기 숙제 때문에 장미 모양 다육이를 샀다.', 0]

## 2.2 데이터 입력 준비

In [9]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; the fixed random_state makes the
# shuffled split reproducible across runs.
train, test = train_test_split(datalist, test_size=0.2, random_state=42, shuffle=True)
# Report how many samples ended up in each split.
print("train shape is:", len(train))
print("test shape is:", len(test))

train shape is: 78105
test shape is: 19527


In [29]:
from util.data_setter import BERTDataset

하이퍼 파라미터

In [11]:
# --- Sequence lengths -------------------------------------------------------
# Passed to BERTDataset as its last argument.
# NOTE(review): presumably a cap applied before the final truncation to
# max_len — confirm against util.data_setter.BERTDataset.
pre_max_len = 4096
# Passed to BERTDataset as the sequence length.
max_len = 512

# --- Batching / schedule ----------------------------------------------------
batch_size = 6
num_epochs = 25
# Fraction of the total optimisation steps spent in learning-rate warm-up.
warmup_ratio = 0.1

# --- Optimisation -----------------------------------------------------------
learning_rate = 5e-5
# Gradient-norm clipping threshold used in the training loop.
max_grad_norm = 1

# --- Logging ----------------------------------------------------------------
# Emit a training log line every `log_interval` batches.
log_interval = 200

데이터 로더

In [12]:
# Tokenise both splits up front (the recorded progress bars show roughly one
# minute for train and 15 seconds for test).
# NOTE(review): the positional arguments 0, 1, 2 are presumably the indices of
# the first sentence, second sentence and label inside each sample, and the two
# True flags presumably enable padding / pair mode — confirm against
# util.data_setter.BERTDataset.
data_train = BERTDataset(train, 0, 1, 2, tok, max_len, True, True,pre_max_len)
data_test = BERTDataset(test, 0, 1, 2, tok, max_len, True, True,pre_max_len)

100%|██████████████████████████████████████████████████████████████████████████| 78105/78105 [01:02<00:00, 1250.88it/s]
100%|██████████████████████████████████████████████████████████████████████████| 19527/19527 [00:15<00:00, 1268.49it/s]


In [24]:
# Batch the tokenised datasets for training and evaluation.
# The training loader reshuffles every epoch so the model does not see the
# samples in the same fixed order 25 times; the test loader keeps a stable
# order so evaluation results are reproducible.
train_dataloader = torch.utils.data.DataLoader(
    data_train, batch_size=batch_size, shuffle=True, num_workers=4
)
test_dataloader = torch.utils.data.DataLoader(
    data_test, batch_size=batch_size, shuffle=False, num_workers=4
)

# 3. 모델 학습

## 3.1 모델 Load

첫 학습인 경우

In [30]:
from util.model import BERTClassifier

In [31]:
# Fresh run: wrap the pretrained KoBERT encoder in the project's classifier
# head with two output classes and move it to the selected device.
# NOTE(review): dr_rate is presumably the dropout probability of the head —
# confirm against util.model.BERTClassifier.
model = BERTClassifier(bertmodel, dr_rate=0.5, num_classes=2).to(device)

이어서 학습하는 경우

In [35]:
# Resume run: enter the path of the checkpoint to continue from below.
# The checkpoint is a whole pickled model object, so unpickling imports the
# module paths that existed when it was saved. The recorded error
# "No module named 'transformers.modeling_bert'" means the installed
# transformers package does not provide that module path.
# NOTE(review): presumably the checkpoint was saved under an older transformers
# release — pin that version, or re-export the weights as a state_dict.
# map_location restores the tensors onto `device` regardless of the device
# they were saved from.
model = torch.load('./modelCheckpoint/flowcheck_ver04.model', map_location=device)

ModuleNotFoundError: No module named 'transformers.modeling_bert'

## 3.2 옵티마이저

In [16]:
# Parameters whose names contain one of these fragments are excluded from
# weight decay (biases and LayerNorm weights).
no_decay = ['bias', 'LayerNorm.weight']

# Partition the model parameters in a single pass, preserving the order in
# which named_parameters() yields them.
decayed_params = []
undecayed_params = []
for param_name, param in model.named_parameters():
    if any(fragment in param_name for fragment in no_decay):
        undecayed_params.append(param)
    else:
        decayed_params.append(param)

optimizer_grouped_parameters = [
    {'params': decayed_params, 'weight_decay': 0.01},
    {'params': undecayed_params, 'weight_decay': 0.0},
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
# Unweighted cross-entropy; a class-weighted variant
# (nn.CrossEntropyLoss(weight=weight)) was tried previously and left disabled.
loss_fn = nn.CrossEntropyLoss()

# One scheduler step per batch: total steps = batches per epoch * epochs,
# with the first `warmup_ratio` share of them used for warm-up.
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_step,
    num_training_steps=t_total,
)

In [17]:
from util.model import calc_accuracy as accr

## 3.3 학습

In [25]:
# Fine-tune the classifier for `num_epochs` epochs.
# Every `log_interval` batches a progress line with the current loss and the
# running training accuracy is written; every 1000 batches the whole model is
# pickled into `filepath`.
filepath = './modelCheckpoint/'
n_batch = len(train_dataloader)
for e in range(num_epochs):
    train_acc = 0.0  # sum of per-batch accuracies within this epoch
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(train_dataloader, file=sys.stdout)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        # valid_length is handed to the model exactly as the loader produced it.
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        # Clip the global gradient norm before the optimiser step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        # The accuracy helper is imported as `accr`; the previous name
        # `calc_accuracy` was never brought into scope and raised NameError.
        train_acc += accr(out, label)
        running_acc = train_acc / (batch_id + 1)
        if batch_id % log_interval == 0:
            tqdm.write("epoch %02d batch id %04d/%d loss %f train acc %f" % (e+1, batch_id+1, n_batch, loss.item(), running_acc))
        if batch_id % 1000 == 0:
            print("\nSaving model....\n")
            # NOTE(review): pickling the whole module ties the checkpoint to the
            # exact library module paths in use; consider saving state_dict().
            torch.save(model, filepath+'epoch_%d_batch_%d_acc_%.2f.model' % (e+1, batch_id+1, running_acc*100))

hi
  0%|                                                                                        | 0/13018 [00:23<?, ?it/s]


KeyboardInterrupt: 

# 4. 모델 평가

In [37]:
# Build a one-sample dataset/loader from a hand-written sentence pair
# (same [sentence, sentence, label] layout as the training data) for a quick
# manual check of the model.
tes = [['공주가 갈 곳이 없어 길을 잃었어요', '토끼를 잡아먹은 건 여우였어요',1]]
tes_data = BERTDataset(tes, 0, 1, 2, tok, max_len, True, True,pre_max_len)
tes_dataloader = torch.utils.data.DataLoader(tes_data, batch_size=batch_size, num_workers=4)

100%|███████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 248.11it/s]


In [None]:
# Evaluate the model on the held-out test loader.
# `res` collects the predicted class of every sample and `ans` the matching
# ground-truth label, in loader order; a verdict line is printed per sample.
# NOTE(review): swap in `tes_dataloader` to check only the hand-written pair.
res = []
ans = []
model.eval()
# Inference only: disable autograd so no graph is built or kept in memory.
with torch.no_grad():
    for token_ids, valid_length, segment_ids, label in tqdm(test_dataloader, file=sys.stdout):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        # Score every sample in the batch; previously only out[0] / label[0]
        # (the first sample of each batch) was looked at.
        predictions = out.argmax(dim=1).cpu().numpy()
        answers = label.long().cpu().numpy()
        for result, answer in zip(predictions, answers):
            res.append(result)
            ans.append(answer)
            if result == answer:
                print("정답이 맞았습니다.")
            else:
                print("정답이 틀렸습니다.")