# KoBERT finetuning

In [23]:
# KoBERT tokenizer package, installed directly from the SKTBrain GitHub repo (kobert_hf subdirectory).
!pip install "git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf"
# NOTE(review): mxnet is never imported directly in this notebook; presumably it is a
# runtime dependency of gluonnlp — confirm before removing.
!pip install mxnet
# gluonnlp provides BERTVocab / BERTSentenceTransform; pandas reads the CSV; tqdm draws progress bars.
!pip install gluonnlp pandas tqdm
# NOTE(review): presumably required by the sentencepiece-based tokenizer vocabulary — confirm.
!pip install sentencepiece
# transformers supplies BertModel, AdamW and the cosine warmup schedule; torch is the training framework.
# NOTE(review): none of these installs pin a version, so a fresh run may resolve different releases.
!pip install transformers
!pip install torch

Collecting kobert_tokenizer
  Cloning https://github.com/SKTBrain/KoBERT.git to /tmp/pip-install-ete5pbvx/kobert-tokenizer_f83b5838a8814e61a1641001e270a090
  Running command git clone -q https://github.com/SKTBrain/KoBERT.git /tmp/pip-install-ete5pbvx/kobert-tokenizer_f83b5838a8814e61a1641001e270a090


In [24]:
import torch
from torch import nn
from torch.utils.data import Dataset
import gluonnlp as nlp
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

In [25]:
from kobert_tokenizer import KoBERTTokenizer
from transformers import BertModel

In [26]:
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

In [27]:
# Colab-only: mount Google Drive at /content/drive so the dataset CSV and the
# checkpoint path used later in this notebook (both under /content/drive/MyDrive) resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
# Use the first CUDA GPU when one is visible to PyTorch; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [29]:
# Tokenizer and encoder are loaded from the same pretrained checkpoint name.
pretrained_name = 'skt/kobert-base-v1'
tokenizer = KoBERTTokenizer.from_pretrained(pretrained_name)
# return_dict=False: BERTClassifier.forward consumes the encoder output as a plain tuple.
bertmodel = BertModel.from_pretrained(pretrained_name, return_dict=False)

# Callable tokenizer and gluonnlp vocabulary handed to BERTSentenceTransform via BERTDataset.
tok = tokenizer.tokenize
vocab = nlp.vocab.BERTVocab.from_sentencepiece(tokenizer.vocab_file, padding_token='[PAD]')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.


In [30]:
# Read the CP949-encoded CSV, then flatten it into a plain list of rows.
# Keeping the DataFrame under its own name makes this cell safe to re-run.
raw_frame = pd.read_csv('/content/drive/MyDrive/hug/ml/data/dataset2.csv', encoding='cp949')
data = raw_frame.values.tolist()

In [31]:
# Hold out 20% of the rows for evaluation; random_state=0 keeps the shuffled split repeatable.
dataset_train, dataset_test = train_test_split(
    data,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

In [32]:
class BERTDataset(Dataset):
    """Pre-encoded sentence/label pairs for BERT classification.

    Every row of ``dataset`` is encoded once, at construction time, with a
    gluonnlp ``BERTSentenceTransform``; items are served from memory afterwards.

    Args:
        dataset: iterable of indexable rows.
        sent_idx: position of the sentence text inside a row.
        label_idx: position of the label inside a row.
        bert_tokenizer: tokenizer callable forwarded to the transform.
        vocab: vocabulary forwarded to the transform.
        max_len: forwarded to the transform as ``max_seq_length``.
        pad: forwarded to the transform as ``pad``.
        pair: forwarded to the transform as ``pair``.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, vocab, max_len,
                 pad, pair):
        encode = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, vocab=vocab, pad=pad, pair=pair)

        self.sentences = []
        for row in dataset:
            # The transform is given a one-element list holding the sentence.
            self.sentences.append(encode([row[sent_idx]]))

        self.labels = []
        for row in dataset:
            self.labels.append(np.int32(row[label_idx]))

    def __getitem__(self, i):
        """Return the encoded sentence tuple for item ``i`` with its label appended."""
        encoded = self.sentences[i]
        label = self.labels[i]
        return encoded + (label,)

    def __len__(self):
        """Number of examples (one label per example)."""
        return len(self.labels)


In [33]:
## Setting parameters

# Data: sequence length handed to the sentence transform, and DataLoader batch size.
max_len = 64
batch_size = 64

# Optimisation.
num_epochs = 5
learning_rate = 5e-5
warmup_ratio = 0.1   # fraction of all scheduler steps spent warming up
max_grad_norm = 1    # gradient-norm clipping threshold

# Logging: print running loss/accuracy every `log_interval` training batches.
log_interval = 200

In [34]:
# Encode both splits up front: column 0 of each row is the sentence, column 1 the label.
# Sequences are padded (pad=True) and treated as single sentences (pair=False).
data_train = BERTDataset(
    dataset_train, sent_idx=0, label_idx=1, bert_tokenizer=tok,
    vocab=vocab, max_len=max_len, pad=True, pair=False)
data_test = BERTDataset(
    dataset_test, sent_idx=0, label_idx=1, bert_tokenizer=tok,
    vocab=vocab, max_len=max_len, pad=True, pair=False)

In [35]:
# Reshuffle the training set at the start of every epoch so the model does not see the
# exact same batches in the exact same order each time; evaluation keeps a fixed order.
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, shuffle=True, num_workers=5)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)

  cpuset_checked))


In [36]:
class BERTClassifier(nn.Module):
    """BERT encoder followed by optional dropout and a linear classification head.

    Args:
        bert: encoder module. It is called with the keywords ``input_ids``,
            ``token_type_ids`` and ``attention_mask`` and must return an
            indexable output whose element 1 is the pooled representation.
        hidden_size: width of the pooled representation fed to the linear head.
        num_classes: number of logits produced per example.
        dr_rate: dropout probability applied to the pooled output; a falsy
            value (``None`` or ``0``) disables dropout.
        params: unused; kept so callers that already pass it keep working.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=7,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Build a float mask with 1.0 on the first ``valid_length[i]`` positions of row ``i``.

        Args:
            token_ids: 2-D tensor (batch, sequence); only its shape and device are used.
            valid_length: per-row count of real (non-padding) tokens, as a tensor
                or a sequence of ints. It may live on a different device.

        Returns:
            float32 tensor shaped like ``token_ids``, on ``token_ids``' device.
        """
        # Column vector of lengths, moved to the device of the ids if necessary.
        lengths = torch.as_tensor(valid_length, device=token_ids.device).reshape(-1, 1)
        positions = torch.arange(token_ids.size(1), device=token_ids.device)
        # Broadcasting (seq,) against (batch, 1) marks every position below the row's
        # length in one operation, instead of one slice assignment per row.
        return (positions < lengths).float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return classification logits of shape (batch, num_classes)."""
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        encoder_out = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask)
        # Index rather than unpack: still correct when the encoder returns extra
        # elements after the pooled output (e.g. hidden states).
        pooler = encoder_out[1]
        if self.dr_rate:
            out = self.dropout(pooler)
        else:
            out = pooler
        return self.classifier(out)

In [37]:
# Wrap the pretrained encoder with the classification head (dropout p=0.5 on the
# pooled output), then move all parameters to the selected device.
model = BERTClassifier(bert=bertmodel, dr_rate=0.5)
model = model.to(device)

In [38]:
# Build the optimizer parameter groups: parameters whose name contains one of the
# `no_decay` substrings (biases, LayerNorm weights) get no weight decay; the rest use 0.01.
no_decay = ['bias', 'LayerNorm.weight']
decay_params, no_decay_params = [], []
for param_name, param in model.named_parameters():
    if any(nd in param_name for nd in no_decay):
        no_decay_params.append(param)
    else:
        decay_params.append(param)
optimizer_grouped_parameters = [
    {'params': decay_params, 'weight_decay': 0.01},
    {'params': no_decay_params, 'weight_decay': 0.0},
]

In [39]:
# Cross-entropy over the classifier logits.
loss_fn = nn.CrossEntropyLoss()
# AdamW over the two parameter groups; each group carries its own weight_decay.
# NOTE(review): transformers' AdamW is deprecated in favour of torch.optim.AdamW —
# confirm it still exists in the installed transformers version.
optimizer = AdamW(params=optimizer_grouped_parameters, lr=learning_rate)



In [40]:
# Total number of optimizer/scheduler steps across all epochs, and how many of
# those are spent in warmup.
steps_per_epoch = len(train_dataloader)
t_total = steps_per_epoch * num_epochs
warmup_step = int(warmup_ratio * t_total)

In [41]:
# Cosine learning-rate schedule with warmup; it is stepped once per training batch.
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_step,
    num_training_steps=t_total,
)

In [42]:
def calc_accuracy(X,Y):
    """Fraction of rows of ``X`` whose highest-scoring column index equals the label in ``Y``.

    Args:
        X: 2-D tensor of per-class scores, one row per example.
        Y: 1-D tensor of integer class labels, one per row of ``X``.

    Returns:
        Accuracy over the batch as a NumPy scalar.
    """
    _, predicted = torch.max(X, 1)
    num_correct = (predicted == Y).sum().data.cpu().numpy()
    batch_len = predicted.size()[0]
    return num_correct / batch_len

In [43]:
# One pass per epoch: train on every batch (with gradient clipping and a per-batch
# scheduler step), then measure accuracy on the held-out split.
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0

    # ---- training ----
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in tqdm(enumerate(train_dataloader), total=len(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        # valid_length is passed through as loaded; the model builds the attention mask from it.
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            # train_acc is a running sum of per-batch accuracies, so divide by batches seen.
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.item(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

    # ---- evaluation ----
    model.eval()
    # No gradients are needed for evaluation; disabling autograd avoids building a
    # graph for every test batch and lowers memory use.
    with torch.no_grad():
        for batch_id, (token_ids, valid_length, segment_ids, label) in tqdm(enumerate(test_dataloader), total=len(test_dataloader)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

  cpuset_checked))


  0%|          | 0/550 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 2.050551414489746 train acc 0.140625
epoch 1 batch id 201 loss 0.7622292041778564 train acc 0.47737873134328357
epoch 1 batch id 401 loss 0.715035617351532 train acc 0.6595620324189526
epoch 1 train acc 0.720284090909091


  0%|          | 0/138 [00:00<?, ?it/s]

epoch 1 test acc 0.8891530797101449


  0%|          | 0/550 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 0.41140857338905334 train acc 0.84375
epoch 2 batch id 201 loss 0.40085670351982117 train acc 0.9010416666666666
epoch 2 batch id 401 loss 0.5141664743423462 train acc 0.9110037406483791
epoch 2 train acc 0.9162581168831169


  0%|          | 0/138 [00:00<?, ?it/s]

epoch 2 test acc 0.9106511512388967


  0%|          | 0/550 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 0.23104357719421387 train acc 0.921875
epoch 3 batch id 201 loss 0.2594834566116333 train acc 0.933535447761194
epoch 3 batch id 401 loss 0.3553604483604431 train acc 0.9421758104738155
epoch 3 train acc 0.9458725649350649


  0%|          | 0/138 [00:00<?, ?it/s]

epoch 3 test acc 0.9172181802244039


  0%|          | 0/550 [00:00<?, ?it/s]

epoch 4 batch id 1 loss 0.08482639491558075 train acc 0.984375
epoch 4 batch id 201 loss 0.1857251077890396 train acc 0.9587220149253731
epoch 4 batch id 401 loss 0.11627655476331711 train acc 0.9646586658354115
epoch 4 train acc 0.9671022727272728


  0%|          | 0/138 [00:00<?, ?it/s]

epoch 4 test acc 0.923777904394577


  0%|          | 0/550 [00:00<?, ?it/s]

epoch 5 batch id 1 loss 0.040723513811826706 train acc 0.984375
epoch 5 batch id 201 loss 0.12942782044410706 train acc 0.9721703980099502
epoch 5 batch id 401 loss 0.1769629269838333 train acc 0.9752961346633416
epoch 5 train acc 0.9765340909090909


  0%|          | 0/138 [00:00<?, ?it/s]

epoch 5 test acc 0.9242234981299673


In [44]:
# Persist only the learned weights (state_dict), not the full module object.
checkpoint_path = '/content/drive/MyDrive/hug/ml/my_path/model4.pth'
torch.save(model.state_dict(), checkpoint_path)