# KoBERT - Training 정치 도메인

## 1. Library Import

In [7]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook

from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

##GPU 사용 시
device = torch.device("cuda:0")
device

device(type='cuda', index=0)

## 2. Get Pretrained-Model / Tokenizer

In [8]:
# bert model
# vocab - Vocab(size=8002, unk="[UNK]", reserved="['[CLS]', '[SEP]', '[MASK]', '[PAD]']")
bertmodel, vocab = get_pytorch_kobert_model()

tokenizer = get_tokenizer() # str => /home/neuralworks/kobert/kobert_news_wiki_ko_cased-1087f8699e.spiece
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

# tok('안녕하세요. 저는 김선민 입니다')

using cached model
using cached model
using cached model


## 3. 데이터 Processing 
- xlsx -> to txt

In [9]:
import pandas as pd
data = pd.read_excel('data/Youtube댓글정리_httpsyoutu.belm7p--F7eF0_이재명_모든경기도민에_재난지원금_정리완료.xlsx')
data = data.iloc[:-4]

# data one hot encoder
ls = [{'positive':2,'neutral':1, 'negative':0}[i] for i in data['결과']]
data['결과_인코더'] = ls

# split train-test 7:3
size = int(len(data)*0.7)
print(size)
train_policy = data[['댓글','결과_인코더']].iloc[:size]
test_policy = data[['댓글','결과_인코더']].iloc[size:]
print(f'train_data {train_policy.shape}, test_data {test_policy.shape}')

# DataFrame 이나 Serises 를 txt 파일로 깔끔하게 바꿀경우 (이건 tsv)
train_policy.to_csv('data/youtube_train_policy1.txt', index=False, header=None, sep="\t")
test_policy.to_csv('data/youtube_test_policy1.txt', index=False, header=None, sep="\t")
print('train, test txt파일 저장완료')

#train_policy['결과_인코더'].value_counts() # 233, 150, 45
#test_policy['결과_인코더'].value_counts() # 131, 35, 18

428
train_data (428, 2), test_data (184, 2)
train, test txt파일 저장완료


In [10]:
data['결과'].value_counts()

negative    364
positive    185
neutral      63
Name: 결과, dtype: int64

## 4. Get Dataset
- txt 파일 TSV로 읽어들이기.

In [11]:

# field indices=[1,2] 란 => [텍스트, 0] 값 지정한 것
# num_discard_samples은 tsv 데이터 상위 몇 개의 row를 제거할지
dataset_train = nlp.data.TSVDataset("data/youtube_train_policy1.txt")
dataset_test = nlp.data.TSVDataset("data/youtube_test_policy1.txt")

max_len = 64 # 텍스트 데이터 최대 길이
batch_size = 4

In [12]:
class BERTDataset(Dataset):
    # dataset, 0, 1, tokenizer, max_len, True, False
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len, pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))

data_train = BERTDataset(dataset_train, 0, 1, tok, max_len, True, False)
data_test = BERTDataset(dataset_test, 0, 1, tok, max_len, True, False)

train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)

## 5. Modeling, Hyper Parameter

In [13]:
## Setting parameters
warmup_ratio = 0.1
num_epochs = 1
max_grad_norm = 1
log_interval = 200
learning_rate =  5e-5

In [14]:
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=3, # 긍정 or 부정 or 중립
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
                 
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
    
    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)
    
model = BERTClassifier(bertmodel,  dr_rate=0.5).to(device)

In [15]:
# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

# optimizer, loss
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

# warmup scheduler
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

# calc accuracy
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc

## 6. Training

In [16]:
for e in range(num_epochs):
    train_acc, test_acc = 0.0, 0.0
    # train
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    
    # evaluate
    model.eval()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        test_acc += calc_accuracy(out, label)
    torch.save(model, 'model/kobert-policy-'+str(e)+'.pt')
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/107 [00:00<?, ?it/s]

The current process just got forked. Disabling parallelism to avoid deadlocks...
The current process just got forked. Disabling parallelism to avoid deadlocks...
The current process just got forked. Disabling parallelism to avoid deadlocks...
The current process just got forked. Disabling parallelism to avoid deadlocks...
The current process just got forked. Disabling parallelism to avoid deadlocks...
epoch 1 batch id 1 loss 0.9849988222122192 train acc 0.25
The current process just got forked. Disabling parallelism to avoid deadlocks...
The current process just got forked. Disabling parallelism to avoid deadlocks...
epoch 1 train acc 0.5794392523364486


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/46 [00:00<?, ?it/s]

The current process just got forked. Disabling parallelism to avoid deadlocks...
The current process just got forked. Disabling parallelism to avoid deadlocks...
The current process just got forked. Disabling parallelism to avoid deadlocks...
The current process just got forked. Disabling parallelism to avoid deadlocks...
The current process just got forked. Disabling parallelism to avoid deadlocks...
The current process just got forked. Disabling parallelism to avoid deadlocks...
The current process just got forked. Disabling parallelism to avoid deadlocks...
The current process just got forked. Disabling parallelism to avoid deadlocks...
epoch 1 test acc 0.7119565217391305
