# KoBERT finetuning

In [1]:
!pip install "git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf"
!pip install mxnet
!pip install gluonnlp pandas tqdm
!pip install sentencepiece
!pip install transformers
!pip install torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting kobert_tokenizer
  Cloning https://github.com/SKTBrain/KoBERT.git to /tmp/pip-install-srck1rw8/kobert-tokenizer_667866fefe884323b7b77f5ca016291b
  Running command git clone -q https://github.com/SKTBrain/KoBERT.git /tmp/pip-install-srck1rw8/kobert-tokenizer_667866fefe884323b7b77f5ca016291b
Building wheels for collected packages: kobert-tokenizer
  Building wheel for kobert-tokenizer (setup.py) ... [?25l[?25hdone
  Created wheel for kobert-tokenizer: filename=kobert_tokenizer-0.1-py3-none-any.whl size=4649 sha256=2c50b018ad2a3d689b55de6179210525a29e4bed5b10edbc097ea2ab601dd6ff
  Stored in directory: /tmp/pip-ephem-wheel-cache-te39gq_0/wheels/10/b4/d9/cb627bbfaefa266657b0b4e8127f7bf96d27376fa1a23897b4
Successfully built kobert-tokenizer
Installing collected packages: kobert-tokenizer
Successfully installed kobert-tokenizer-0.1
Looking in indexes: https://pypi.org/simple, https:

In [2]:
import torch
from torch import nn
from torch.utils.data import Dataset
import gluonnlp as nlp
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

In [3]:
from kobert_tokenizer import KoBERTTokenizer
from transformers import BertModel

In [4]:
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [7]:
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
tok = tokenizer.tokenize
bertmodel = BertModel.from_pretrained('skt/kobert-base-v1', return_dict=False)
vocab = nlp.vocab.BERTVocab.from_sentencepiece(tokenizer.vocab_file, padding_token='[PAD]')

Downloading:   0%|          | 0.00/363k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/244 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/432 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.


Downloading:   0%|          | 0.00/535 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/352M [00:00<?, ?B/s]

In [8]:
bertmodel.embeddings.word_embeddings

Embedding(8002, 768, padding_idx=1)

In [9]:
data = pd.read_csv('/content/drive/MyDrive/hug/ml/data/dataset10.csv', encoding='cp949')
data = data.values.tolist()

In [10]:
dataset_train, dataset_test = train_test_split(data, test_size=0.2, shuffle=True, random_state=0)

In [11]:
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, vocab, max_len,
                 pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len,vocab=vocab, pad=pad, pair=pair)

        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))


In [12]:
## Setting parameters
max_len = 64
batch_size = 64
warmup_ratio = 0.1
num_epochs = 5
max_grad_norm = 1
log_interval = 200
learning_rate =  5e-5

In [13]:
data_train = BERTDataset(dataset_train, 0, 1, tok, vocab, max_len, True, False)
data_test = BERTDataset(dataset_test, 0, 1, tok, vocab, max_len, True, False)

In [14]:
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=4)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=4)

In [15]:
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=7,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
                 
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
    
    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        else:
            out = pooler
        return self.classifier(out)

In [16]:
model = BERTClassifier(bertmodel,  dr_rate=0.5).to(device)

In [17]:
# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

In [18]:
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()



In [19]:
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

In [20]:
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

In [21]:
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc

In [22]:
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in tqdm(enumerate(train_dataloader), total=len(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    model.eval()
    for batch_id, (token_ids, valid_length, segment_ids, label) in tqdm(enumerate(test_dataloader), total=len(test_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

  0%|          | 0/550 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 2.051372766494751 train acc 0.125
epoch 1 batch id 201 loss 0.8457270860671997 train acc 0.4518812189054726
epoch 1 batch id 401 loss 0.8845440745353699 train acc 0.5801511845386533
epoch 1 train acc 0.6214375


  0%|          | 0/138 [00:00<?, ?it/s]

epoch 1 test acc 0.746863258185722


  0%|          | 0/550 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 0.8513035178184509 train acc 0.71875
epoch 2 batch id 201 loss 0.5440534353256226 train acc 0.7534203980099502
epoch 2 batch id 401 loss 0.8356005549430847 train acc 0.7632481296758105
epoch 2 train acc 0.7692840909090909


  0%|          | 0/138 [00:00<?, ?it/s]

epoch 2 test acc 0.758978294417606


  0%|          | 0/550 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 0.6437979936599731 train acc 0.78125
epoch 3 batch id 201 loss 0.5008636116981506 train acc 0.8003731343283582
epoch 3 batch id 401 loss 0.6153915524482727 train acc 0.8087203865336658
epoch 3 train acc 0.8132613636363636


  0%|          | 0/138 [00:00<?, ?it/s]

epoch 3 test acc 0.7669459541062802


  0%|          | 0/550 [00:00<?, ?it/s]

epoch 4 batch id 1 loss 0.4835430085659027 train acc 0.78125
epoch 4 batch id 201 loss 0.4307602047920227 train acc 0.8385416666666666
epoch 4 batch id 401 loss 0.4799864590167999 train acc 0.8491271820448878
epoch 4 train acc 0.8529147727272727


  0%|          | 0/138 [00:00<?, ?it/s]

epoch 4 test acc 0.7692104468599034


  0%|          | 0/550 [00:00<?, ?it/s]

epoch 5 batch id 1 loss 0.3731677830219269 train acc 0.84375
epoch 5 batch id 201 loss 0.30343449115753174 train acc 0.8665267412935324
epoch 5 batch id 401 loss 0.4212777614593506 train acc 0.8735972568578554
epoch 5 train acc 0.8757443181818182


  0%|          | 0/138 [00:00<?, ?it/s]

epoch 5 test acc 0.7716300993022008


In [23]:
torch.save(model.state_dict(), '/content/drive/MyDrive/hug/ml/my_path/model10.pth')