### 0. 드라이브 마운트

In [None]:
# Mount Google Drive at /content/drive; the data files and checkpoints used
# later in this notebook are addressed under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### 1. 라이브러리 설치

In [None]:
# Install the third-party packages this notebook relies on.
# NOTE(review): no versions are pinned, so the installed versions depend on
# whatever pip resolves at run time — consider pinning for reproducibility.
!pip install mxnet
!pip install gluonnlp pandas tqdm
!pip install sentencepiece
!pip install transformers
!pip install torch



In [None]:
# Install the kobert_tokenizer package from the kobert_hf subdirectory of the
# SKTBrain/KoBERT GitHub repository (not pinned to a specific commit).
!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'

Collecting kobert_tokenizer
  Cloning https://github.com/SKTBrain/KoBERT.git to /tmp/pip-install-eyjvrzb_/kobert-tokenizer_6e3a7be66f704f658557ba9c5d3a7397
  Running command git clone --filter=blob:none --quiet https://github.com/SKTBrain/KoBERT.git /tmp/pip-install-eyjvrzb_/kobert-tokenizer_6e3a7be66f704f658557ba9c5d3a7397
  Resolved https://github.com/SKTBrain/KoBERT.git to commit 47a69af87928fc24e20f571fe10c3cc9dd9af9a3
  Preparing metadata (setup.py) ... [?25l[?25hdone


### 2. 라이브러리 import

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook
from kobert_tokenizer import KoBERTTokenizer
from transformers import BertModel

from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU
# so that every later `.to(device)` call still works on a runtime without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Pretrained KoBERT tokenizer and encoder, both loaded from 'skt/kobert-base-v1'.
# return_dict=False makes the encoder return a plain tuple instead of a ModelOutput.
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
bertmodel = BertModel.from_pretrained('skt/kobert-base-v1', return_dict=False)
# gluonnlp vocabulary built from the tokenizer's SentencePiece file, with '[PAD]' as the padding token.
vocab = nlp.vocab.BERTVocab.from_sentencepiece(tokenizer.vocab_file, padding_token='[PAD]')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.


### 3. 데이터 전처리

In [None]:
import pandas as pd

# Read the training and validation spreadsheets from Google Drive into DataFrames.
# The paths are hard-coded; adjust them if the files live elsewhere.
train_source_data  = pd.read_excel("/content/drive/MyDrive/day-canvas/감성대화말뭉치(최종데이터)_Training.xlsx")
validation_source_data = pd.read_excel("/content/drive/MyDrive/day-canvas/감성대화말뭉치(최종데이터)_Validation.xlsx")


In [None]:
### 세문장 모두 사용하는 경우


# 감정 종류 : 기쁨 당황 분노 불안 상처 슬픔
# def pre_processing(data):
#   data.loc[(data['감정_대분류'] == "기쁨"), '감정_대분류'] = 0
#   data.loc[(data['감정_대분류'] == "당황"), '감정_대분류'] = 1
#   data.loc[(data['감정_대분류'] == "분노"), '감정_대분류'] = 2
#   data.loc[(data['감정_대분류'] == "불안"), '감정_대분류'] = 3
#   data.loc[(data['감정_대분류'] == "상처"), '감정_대분류'] = 4
#   data.loc[(data['감정_대분류'] == "슬픔"), '감정_대분류'] = 5

#   data = data.replace(np.nan, '', regex=True)

#   required_data = []

#   for sentence1, sentence2, sentence3, label in zip(data['사람문장1'], data['사람문장2'], data['사람문장3'], data['감정_대분류']):
#     #sentence1, 2, 3 한 문장으로 엮고 [sentence, emotion] 형태로 엮기

#     concated_sentences = []

#     if bool(sentence1):
#       concated_sentences.append(sentence1)
#       concated_sentences.append(" ")

#     if bool(sentence2):
#       concated_sentences.append(sentence2)
#       concated_sentences.append(" ")

#     if bool(sentence3):
#       concated_sentences.append(sentence3)
#       concated_sentences.append(" ")

#     concated_sentences = ''.join(concated_sentences)

#     data = []
#     data.append(concated_sentences)
#     data.append(str(label))
#     required_data.append(data)

#   return required_data


In [None]:
### 사람문장1만 사용하는 경우


# 감정 종류 : 기쁨 당황 분노 불안 상처 슬픔
def pre_processing(data):
  """Convert the raw corpus frame into a list of ``[sentence, label]`` pairs.

  Only the first human utterance (column '사람문장1') is used as the sentence.
  The emotion category (column '감정_대분류') is encoded as a class index:
  기쁨 -> 0, 당황 -> 1, 분노 -> 2, 불안 -> 3, 상처 -> 4, 슬픔 -> 5.

  Args:
    data: pandas DataFrame containing the columns '사람문장1' and '감정_대분류'.
      The frame is only read; it is never modified, so calling this function
      repeatedly on the same frame always gives the same result.

  Returns:
    A list of two-element lists ``[sentence, label]``. ``label`` is the class
    index rendered as a string (e.g. '2'). Missing sentences or labels become
    ''. A category outside the six listed above is passed through as
    ``str(value)``.
  """
  label_by_emotion = {
      "기쁨": 0,
      "당황": 1,
      "분노": 2,
      "불안": 3,
      "상처": 4,
      "슬픔": 5,
  }

  required_data = []

  for sentence1, emotion in zip(data['사람문장1'], data['감정_대분류']):
    # Unknown categories (and already-encoded integers) fall through unchanged.
    label = label_by_emotion.get(emotion, emotion)

    # Missing cells are represented as empty strings.
    sentence1 = '' if pd.isna(sentence1) else sentence1
    label = '' if pd.isna(label) else str(label)

    required_data.append([sentence1, label])

  return required_data


In [None]:
train_data = pre_processing(train_source_data)
validation_data = pre_processing(validation_source_data)

print(train_data[:3])
print(validation_data[:3])

[['일은 왜 해도 해도 끝이 없을까? 화가 난다.', '2'], ['이번 달에 또 급여가 깎였어! 물가는 오르는데 월급만 자꾸 깎이니까 너무 화가 나.', '2'], ['회사에 신입이 들어왔는데 말투가 거슬려. 그런 애를 매일 봐야 한다고 생각하니까 스트레스 받아. ', '2']]
[['이번 프로젝트에서 발표를 하는데 내가 실수하는 바람에 우리 팀이 감점을 받았어. 너무 미안해.', '3'], ['회사에서 중요한 프로젝트를 혼자 하게 됐는데 솔직히 두렵고 무서워.', '3'], ['상사가 너무 무섭게 생겨서 친해지는 게 너무 두려워.', '3']]


### 4. 데이터셋 토큰화 & 파라미터 설정

In [None]:
# Load the KoBERT tokenizer from 'skt/kobert-base-v1'.
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
# gluonnlp SentencePiece tokenizer wrapper (no lower-casing).
# NOTE(review): `tok` is reassigned to `tokenizer.tokenize` in a later cell
# before it is used to build the datasets — confirm whether this wrapper is still needed.
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower = False)

class BERTDataset(Dataset):
    """Dataset that pairs BERT input features with an integer label.

    Each record of ``dataset`` is indexed with ``sent_idx`` to obtain the text
    and with ``label_idx`` to obtain the label. The text is converted once, at
    construction time, by ``nlp.data.BERTSentenceTransform``; the label is
    converted with ``np.int32``.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, vocab, max_len,
                 pad, pair):
        """Pre-compute the features and labels for every record in ``dataset``.

        Args:
            dataset: iterable of indexable records.
            sent_idx: position of the sentence within a record.
            label_idx: position of the label within a record.
            bert_tokenizer: tokenizer handed to ``BERTSentenceTransform``.
            vocab: vocabulary handed to ``BERTSentenceTransform``.
            max_len: forwarded as ``max_seq_length``.
            pad: forwarded as ``pad``.
            pair: forwarded as ``pair``.
        """
        to_features = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, vocab=vocab, pad=pad, pair=pair)

        # First pass: transform every sentence (wrapped in a one-element list).
        encoded = []
        for record in dataset:
            encoded.append(to_features([record[sent_idx]]))
        self.sentences = encoded

        # Second pass: convert every label to a 32-bit integer.
        self.labels = [np.int32(record[label_idx]) for record in dataset]

    def __getitem__(self, i):
        """Return the features of item ``i`` with its label appended as the last element."""
        features = self.sentences[i]
        label = self.labels[i]
        return features + (label,)

    def __len__(self):
        """Return the number of items."""
        return len(self.labels)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.


In [None]:
### Hyperparameters — adjust these as needed

max_len = 16           # passed to BERTDataset as the maximum sequence length
batch_size = 128       # batch size for the data loaders
warmup_ratio = 0.1     # fraction of the total training steps used as scheduler warmup
num_epochs = 5         # number of training epochs
max_grad_norm = 1      # gradient-norm clipping threshold used in the training loop
log_interval = 200     # print training progress every this many batches
learning_rate =  5e-5  # learning rate passed to the optimizer

In [None]:
# Use the KoBERT tokenizer's own `tokenize` method as the tokenizer callable.
tok = tokenizer.tokenize

# Build the train / validation datasets: sentence at index 0, label at index 1,
# pad=True and pair=False are forwarded to the sentence transform.
data_train = BERTDataset(train_data, 0, 1, tok, vocab, max_len, True, False)
data_test = BERTDataset(validation_data, 0, 1, tok, vocab, max_len, True, False)

### 5. 데이터 로더 설정 & 감정 분류 모델 선언

In [None]:
# Shuffle the training set every epoch so each batch mixes emotion classes;
# without shuffling, batches follow the row order of the source spreadsheet.
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size = batch_size, num_workers = 5, shuffle = True)
# Evaluation order does not affect accuracy, so the validation loader stays unshuffled.
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size = batch_size, num_workers = 5)



In [None]:
class BERTClassifier(nn.Module):
    """BERT encoder followed by an optional dropout and a linear classification head.

    Args:
        bert: encoder module; it is called with ``input_ids``, ``token_type_ids``,
            ``attention_mask`` and ``return_dict=False`` and must return a pair
            whose second element is the pooled representation.
        hidden_size: feature size of the pooled representation.
        num_classes: number of output classes (six emotion classes by default).
        dr_rate: dropout probability applied to the pooled representation;
            ``None`` (or 0) disables dropout.
        params: accepted for interface compatibility; not used.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes = 6,   # adjust to the number of emotion classes
                 dr_rate = None,
                 params = None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p = dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Build a float mask with ones on the first ``valid_length[i]`` positions of row ``i``.

        Args:
            token_ids: tensor whose first axis indexes samples; the mask has the
                same shape and device.
            valid_length: iterable with one length per sample.

        Returns:
            Float tensor shaped like ``token_ids`` with 1.0 for valid positions
            and 0.0 elsewhere.
        """
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return the class logits for a batch.

        Args:
            token_ids: token-id tensor fed to the encoder as ``input_ids``.
            valid_length: per-sample count of non-padding tokens.
            segment_ids: segment-id tensor fed to the encoder as ``token_type_ids``.

        Returns:
            Output of the linear head applied to the (optionally dropped-out)
            pooled representation.
        """
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device),return_dict = False)
        # Start from the pooled output so the head also works when dropout is
        # disabled (previously `out` was undefined in that case).
        out = pooler
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)

### 6. loss function, optimizer 선언

In [None]:
# BERT  모델 불러오기
# Wrap the pretrained encoder in the classifier (dropout 0.5) and move it to the target device.
model = BERTClassifier(bertmodel,  dr_rate = 0.5).to(device)

# Optimizer and learning-rate schedule.
# Parameters whose name contains 'bias' or 'LayerNorm.weight' get no weight decay;
# all other parameters use a weight decay of 0.01.
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]


# NOTE(review): `AdamW` here is the one imported from `transformers`; it is reportedly
# deprecated in favour of `torch.optim.AdamW` — confirm against the installed version.
optimizer = AdamW(optimizer_grouped_parameters, lr = learning_rate)
loss_fn = nn.CrossEntropyLoss() # loss function for multi-class classification

# Total number of optimizer steps and the number of those used for warmup.
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

# Cosine schedule with warmup, sized for `t_total` steps as computed above.
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps = warmup_step, num_training_steps = t_total)

# calc_accuracy: helper that measures classification accuracy for one batch
def calc_accuracy(X,Y):
    """Return the fraction of rows of ``X`` whose arg-max along dim 1 equals ``Y``.

    Args:
        X: tensor of per-class scores, one row per sample.
        Y: tensor of target class indices, one per sample.

    Returns:
        The accuracy as a NumPy scalar (number of matches divided by the
        number of rows).
    """
    _, predicted = torch.max(X, 1)
    n_correct = (predicted == Y).sum().data.cpu().numpy()
    n_samples = predicted.size(0)
    return n_correct / n_samples

train_dataloader



<torch.utils.data.dataloader.DataLoader at 0x795af6397760>

### 7. Train

**(중요) 아래 두 셀 중 하나만 선택해서 실행할 것**


**체크포인트 생성 전에 사용하는 셀**

In [None]:
### Use this cell when no checkpoint exists yet (training from scratch)
###
# Must be changed before running:
# 1. The checkpoint save path
###

# Checkpoint files are numbered from 1; the 0-based epoch index is stored inside the file.
checkpoint_num = 1

for e in range(num_epochs):
    cost = 0.0
    train_acc = 0.0
    test_acc = 0.0

    # ---- training pass ----
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        # valid_length is passed to the model as loaded (it is not moved to the device).
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))

        # Accumulate a plain float so the running cost does not keep autograd
        # history alive and the checkpoint stores a number instead of a tensor.
        cost += loss.item()
    cost = cost / len(train_dataloader)

    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

    # ---- evaluation pass (no gradients needed) ----
    model.eval()
    with torch.no_grad():
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

    ## Add a condition here to write a checkpoint only for specific epochs

    torch.save(
        {
          "model": "bertmodel",
          "epoch": e,
          "model_state_dict": model.state_dict(),
          "optimizer_state_dict": optimizer.state_dict(),
          "cost": cost,
          "description": f"bertmodel 체크포인트-{checkpoint_num}",
        },
        f"/content/drive/MyDrive/day-canvas/kobert_checkpoint/checkpoint-{checkpoint_num}.pt",
    )
    checkpoint_num += 1


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/404 [00:00<?, ?it/s]



epoch 1 batch id 1 loss 1.8618680238723755 train acc 0.1015625
epoch 1 batch id 201 loss 1.861506462097168 train acc 0.2447139303482587
epoch 1 batch id 401 loss 1.4271984100341797 train acc 0.29143937032418954
epoch 1 train acc 0.29205902927249244


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/52 [00:00<?, ?it/s]

epoch 1 test acc 0.24055214006126618


  0%|          | 0/404 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 2.208124876022339 train acc 0.2421875
epoch 2 batch id 201 loss 1.581386685371399 train acc 0.4203591417910448
epoch 2 batch id 401 loss 1.3879663944244385 train acc 0.4245051433915212
epoch 2 train acc 0.4240938105359449


  0%|          | 0/52 [00:00<?, ?it/s]

epoch 2 test acc 0.2842415120830497


  0%|          | 0/404 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 1.9653984308242798 train acc 0.375
epoch 3 batch id 201 loss 1.4352538585662842 train acc 0.48433613184079605
epoch 3 batch id 401 loss 1.3146638870239258 train acc 0.48657652743142144
epoch 3 train acc 0.4867796222557038


  0%|          | 0/52 [00:00<?, ?it/s]

epoch 3 test acc 0.31790466516337645


  0%|          | 0/404 [00:00<?, ?it/s]

epoch 4 batch id 1 loss 1.683698296546936 train acc 0.453125
epoch 4 batch id 201 loss 1.2562050819396973 train acc 0.5421719527363185
epoch 4 batch id 401 loss 1.2011502981185913 train acc 0.5357894326683291
epoch 4 train acc 0.535519465669393


  0%|          | 0/52 [00:00<?, ?it/s]

epoch 4 test acc 0.35798826795439076


  0%|          | 0/404 [00:00<?, ?it/s]

epoch 5 batch id 1 loss 1.4651527404785156 train acc 0.5234375
epoch 5 batch id 201 loss 1.15937077999115 train acc 0.5561644900497512
epoch 5 batch id 401 loss 1.5069859027862549 train acc 0.5317175810473815
epoch 5 train acc 0.5307236735901851


  0%|          | 0/52 [00:00<?, ?it/s]

epoch 5 test acc 0.533216420609258


**체크포인트 생성 후에 사용하는 셀**

In [None]:
### Use this cell after a checkpoint has been created (resume training)
###
# Must be changed before running:
# 1. checkpoint = torch.load('<checkpoint path>') : path of the checkpoint saved after the last finished epoch
# 2. num_epochs = 10 : the new total number of epochs
# 3. The checkpoint save path
###

# Path of the checkpoint written after the last completed epoch.
# map_location lets a checkpoint saved on one device be loaded onto the current one.
checkpoint = torch.load('/content/drive/MyDrive/day-canvas/kobert_checkpoint/checkpoint-5.pt', map_location = device)
model.load_state_dict(checkpoint["model_state_dict"])
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
checkpoint_epoch = checkpoint["epoch"]
checkpoint_description = checkpoint["description"]

print(checkpoint_description)

# The stored epoch is 0-based while file numbers are 1-based, so the loaded
# file corresponds to number checkpoint_epoch + 1 and the next file to write is
# checkpoint_epoch + 2. (Starting at checkpoint_epoch would overwrite earlier files.)
checkpoint_num = checkpoint_epoch + 2

num_epochs = 10 # set the new total number of epochs here

# NOTE(review): the learning-rate scheduler is neither rebuilt for the new
# num_epochs nor restored from the checkpoint; it continues from whatever state
# the `scheduler` object currently has in this kernel — confirm this is intended.

for e in range(checkpoint_epoch + 1, num_epochs):
    cost = 0.0
    train_acc = 0.0
    test_acc = 0.0

    # ---- training pass ----
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        # valid_length is passed to the model as loaded (it is not moved to the device).
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))

        # Accumulate a plain float so the running cost does not keep autograd
        # history alive and the checkpoint stores a number instead of a tensor.
        cost += loss.item()
    cost = cost / len(train_dataloader)

    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

    # ---- evaluation pass (no gradients needed) ----
    model.eval()
    with torch.no_grad():
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

    ## Add a condition here to write a checkpoint only for specific epochs

    # The checkpoint save path below must be adjusted
    torch.save(
        {
          "model": "bertmodel",
          "epoch": e,
          "model_state_dict": model.state_dict(),
          "optimizer_state_dict": optimizer.state_dict(),
          "cost": cost,
          "description": f"bertmodel 체크포인트-{checkpoint_num}",
        },
        f"/content/drive/MyDrive/day-canvas/kobert_checkpoint/checkpoint-{checkpoint_num}.pt",
    )
    checkpoint_num += 1


bertmodel 체크포인트-5


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/404 [00:00<?, ?it/s]

epoch 8 batch id 1 loss 1.4722431898117065 train acc 0.5234375
epoch 8 batch id 201 loss 0.9448878765106201 train acc 0.6316464552238806
epoch 8 batch id 401 loss 1.1091481447219849 train acc 0.6554317331670823
epoch 8 train acc 0.6550930908308222


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/52 [00:00<?, ?it/s]

epoch 8 test acc 0.32819147804629


  0%|          | 0/404 [00:00<?, ?it/s]

epoch 9 batch id 1 loss 1.5607017278671265 train acc 0.5078125
epoch 9 batch id 201 loss 0.8774580955505371 train acc 0.6902985074626866


KeyboardInterrupt: ignored

### 8. 테스트

In [None]:
def predict(predict_sentence): # input = the sentence whose emotion should be classified
    """Classify the emotion of one sentence, print the result and return it.

    The sentence is wrapped in a one-item ``BERTDataset`` (with a placeholder
    label), run through ``model`` in evaluation mode without gradient tracking,
    and the class with the highest logit is mapped to its emotion name.

    Args:
        predict_sentence: the text to classify.

    Returns:
        The predicted emotion name, one of 기쁨, 당황, 분노, 불안, 상처, 슬픔.
    """
    # Class index -> emotion name, in the same order used when encoding the labels.
    emotions = ("기쁨", "당황", "분노", "불안", "상처", "슬픔")

    # The dataset requires a label column; '0' is a placeholder that is never used.
    dataset_another = [[predict_sentence, '0']]

    another_test = BERTDataset(dataset_another, 0, 1, tok, vocab, max_len, True, False) # tokenized sentence
    # A single sample does not need worker processes; load it in the main process.
    single_loader = torch.utils.data.DataLoader(another_test, batch_size = batch_size, num_workers = 0)

    model.eval()

    predicted_emotion = None
    with torch.no_grad():
        for token_ids, valid_length, segment_ids, _ in single_loader:
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)

            out = model(token_ids, valid_length, segment_ids)

            # Only one sentence was submitted, so the first row holds its logits.
            logits = out[0].detach().cpu().numpy()
            predicted_emotion = emotions[int(np.argmax(logits))]

            print(">> 입력하신 내용에서 " + predicted_emotion + " 느껴집니다.")

    return predicted_emotion

In [None]:
# Interactive demo: classify each typed sentence until the user enters "0".
while True:
    try:
        sentence = input("하고싶은 말을 입력해주세요 : ")
    except (EOFError, KeyboardInterrupt):
        # Interrupting the kernel or closing the input stream ends the session
        # cleanly instead of leaving a traceback in the cell output.
        break
    if sentence == "0" :
        break
    predict(sentence)
    print("\n")

하고싶은 말을 입력해주세요 : 안녕하세요!
>> 입력하신 내용에서 기쁨 느껴집니다.


하고싶은 말을 입력해주세요 : 똥 마려워요
>> 입력하신 내용에서 분노 느껴집니다.




KeyboardInterrupt: ignored