In [1]:
!pip install gluonnlp pandas tqdm
!pip install mxnet
!pip install sentencepiece
!pip install transformers
!pip install torch
!pip install mxnet-mkl==1.6.0 numpy==1.23.1
!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'

Collecting kobert_tokenizer
  Cloning https://github.com/SKTBrain/KoBERT.git to /tmp/pip-install-_g_t753l/kobert-tokenizer_5f3a5df785a846a189d0aba0d125f10f
  Running command git clone --filter=blob:none --quiet https://github.com/SKTBrain/KoBERT.git /tmp/pip-install-_g_t753l/kobert-tokenizer_5f3a5df785a846a189d0aba0d125f10f
  Resolved https://github.com/SKTBrain/KoBERT.git to commit 47a69af87928fc24e20f571fe10c3cc9dd9af9a3
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: kobert_tokenizer
  Building wheel for kobert_tokenizer (setup.py) ... [?25l[?25hdone
  Created wheel for kobert_tokenizer: filename=kobert_tokenizer-0.1-py3-none-any.whl size=4633 sha256=342c6b7161380173af2a30f195a167318016a38c41b0b06132c7e65ff5fc9d74
  Stored in directory: /tmp/pip-ephem-wheel-cache-mi6v7nyx/wheels/e9/1a/3f/a864970e8a169c176befa3c4a1e07aa612f69195907a4045fe
Successfully built kobert_tokenizer
Installing collected packages: kobert_tokenizer
Successfully ins

In [2]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
import pandas as pd
from tqdm import tqdm, tqdm_notebook

In [3]:
from kobert_tokenizer import KoBERTTokenizer
from transformers import BertModel
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
from sklearn.model_selection import train_test_split

In [4]:
device = torch.device("cuda:0")

In [11]:
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
bertmodel = BertModel.from_pretrained('skt/kobert-base-v1', return_dict=False)
vocab = nlp.vocab.BERTVocab.from_sentencepiece(tokenizer.vocab_file, padding_token='[PAD]')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.


In [17]:
train = pd.read_csv('/content/drive/MyDrive/train_data.csv')
test = pd.read_csv('/content/drive/MyDrive/test_data.csv')

train = train.astype(str)
test = test.astype(str)

In [18]:
emotion_mapping = {
    "불안": 0,
    "슬픔": 1,
    "당황": 2,
    "기쁨": 3,
    "분노": 4,
    "상처": 5
}

data_train_list = []
data_test_list = []

for index, row in train.iterrows():
    for i in range(1, 4):
        sentence_column = f'SENTENCE{i}'
        data_point = []
        data_point.append(row[sentence_column])
        data_point.append(str(emotion_mapping[row['EMOTION']]))
        data_train_list.append(data_point)

for index, row in test.iterrows():
    for i in range(1, 4):
        sentence_column = f'SENTENCE{i}'
        data_point = []
        data_point.append(row[sentence_column])
        data_point.append(str(emotion_mapping[row['EMOTION']]))
        data_test_list.append(data_point)

In [19]:
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer,vocab, max_len,
                 pad, pair):

        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len,vocab=vocab, pad=pad, pair=pair)

        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))


    def __len__(self):
        return (len(self.labels))


In [20]:
# Setting parameters
max_len = 64
batch_size = 64
warmup_ratio = 0.1
num_epochs = 5
max_grad_norm = 1
log_interval = 200
learning_rate =  5e-5

In [21]:
tok = tokenizer.tokenize
data_train = BERTDataset(data_train_list, 0, 1, tok, vocab, max_len, True, False)
data_test = BERTDataset(data_test_list, 0, 1, tok, vocab,  max_len, True, False)

In [22]:
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)



In [23]:
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes = 6,   ##클래스 수 조정##
                 dr_rate = None,
                 params = None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device),return_dict=False)
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)

In [24]:
#BERT 모델 불러오기
model = BERTClassifier(bertmodel,  dr_rate=0.5).to(device)

#optimizer와 schedule 설정
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss() # 다중분류를 위한 대표적인 loss func

t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

#정확도 측정을 위한 함수 정의
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc



In [25]:
train_history = []
test_history = []
loss_history = []

for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)

        #print(label.shape,out.shape)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
            train_history.append(train_acc / (batch_id+1))
            loss_history.append(loss.data.cpu().numpy())
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    #train_history.append(train_acc / (batch_id+1))

    model.eval()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))
    test_history.append(test_acc / (batch_id+1))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/2421 [00:00<?, ?it/s]



epoch 1 batch id 1 loss 1.8033721446990967 train acc 0.1875
epoch 1 batch id 201 loss 1.778183937072754 train acc 0.1832245024875622
epoch 1 batch id 401 loss 1.4183591604232788 train acc 0.23636221945137156
epoch 1 batch id 601 loss 1.4826570749282837 train acc 0.290817387687188
epoch 1 batch id 801 loss 1.4024393558502197 train acc 0.32379447565543074
epoch 1 batch id 1001 loss 1.3409618139266968 train acc 0.3407373876123876
epoch 1 batch id 1201 loss 0.8876580595970154 train acc 0.36042880932556204
epoch 1 batch id 1401 loss 1.4101577997207642 train acc 0.36380264097073517
epoch 1 batch id 1601 loss 1.9646973609924316 train acc 0.37484384759525297
epoch 1 batch id 1801 loss 1.6549607515335083 train acc 0.3777849111604664
epoch 1 batch id 2001 loss 1.6270402669906616 train acc 0.3773113443278361
epoch 1 batch id 2201 loss 1.273194432258606 train acc 0.38574085642889594
epoch 1 batch id 2401 loss 0.7981763482093811 train acc 0.39441899208663056
epoch 1 train acc 0.39569780049566294


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/312 [00:00<?, ?it/s]

epoch 1 test acc 0.21334661774628882


  0%|          | 0/2421 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 2.1932501792907715 train acc 0.296875
epoch 2 batch id 201 loss 1.2267755270004272 train acc 0.42257462686567165
epoch 2 batch id 401 loss 1.1786028146743774 train acc 0.438824812967581
epoch 2 batch id 601 loss 1.2558658123016357 train acc 0.4504211730449251
epoch 2 batch id 801 loss 1.4250551462173462 train acc 0.45456850811485644
epoch 2 batch id 1001 loss 1.222942590713501 train acc 0.4569336913086913
epoch 2 batch id 1201 loss 0.5214970707893372 train acc 0.46588780183180684
epoch 2 batch id 1401 loss 1.411962866783142 train acc 0.4619022127052106
epoch 2 batch id 1601 loss 1.6199805736541748 train acc 0.46597829481574016
epoch 2 batch id 1801 loss 1.5520050525665283 train acc 0.46525367851193783
epoch 2 batch id 2001 loss 1.5496073961257935 train acc 0.46117566216891553
epoch 2 batch id 2201 loss 1.1457571983337402 train acc 0.46598137210358925
epoch 2 batch id 2401 loss 0.7578147053718567 train acc 0.4709951582673886
epoch 2 train acc 0.47194728418009085


  0%|          | 0/312 [00:00<?, ?it/s]

epoch 2 test acc 0.2774860829959514


  0%|          | 0/2421 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 1.9389736652374268 train acc 0.28125
epoch 3 batch id 201 loss 1.1611016988754272 train acc 0.47699004975124376
epoch 3 batch id 401 loss 0.993028998374939 train acc 0.48885598503740646
epoch 3 batch id 601 loss 1.1166234016418457 train acc 0.5007539517470881
epoch 3 batch id 801 loss 1.396532416343689 train acc 0.5039794007490637
epoch 3 batch id 1001 loss 1.1503820419311523 train acc 0.5063061938061938
epoch 3 batch id 1201 loss 0.36639174818992615 train acc 0.5132311615320566
epoch 3 batch id 1401 loss 1.3037829399108887 train acc 0.5085876159885796
epoch 3 batch id 1601 loss 1.471816062927246 train acc 0.5122677232979388
epoch 3 batch id 1801 loss 1.342246174812317 train acc 0.5132912270960578
epoch 3 batch id 2001 loss 1.397107720375061 train acc 0.5102058345827086
epoch 3 batch id 2201 loss 1.0300010442733765 train acc 0.5152416515220355
epoch 3 batch id 2401 loss 0.67333984375 train acc 0.5205448250728864
epoch 3 train acc 0.5214838909541512


  0%|          | 0/312 [00:00<?, ?it/s]

epoch 3 test acc 0.29631093960863697


  0%|          | 0/2421 [00:00<?, ?it/s]

epoch 4 batch id 1 loss 1.796980619430542 train acc 0.390625
epoch 4 batch id 201 loss 1.0148851871490479 train acc 0.523476368159204
epoch 4 batch id 401 loss 0.8003101348876953 train acc 0.5341334164588528
epoch 4 batch id 601 loss 0.8716274499893188 train acc 0.5465370216306157
epoch 4 batch id 801 loss 1.305177927017212 train acc 0.5473041510611736
epoch 4 batch id 1001 loss 1.2046211957931519 train acc 0.550949050949051
epoch 4 batch id 1201 loss 0.24522733688354492 train acc 0.5572309533721899
epoch 4 batch id 1401 loss 1.236859917640686 train acc 0.5525740542469665
epoch 4 batch id 1601 loss 1.2422398328781128 train acc 0.5570639444097439
epoch 4 batch id 1801 loss 1.3264189958572388 train acc 0.5588561910049972
epoch 4 batch id 2001 loss 1.2285081148147583 train acc 0.5549725137431284
epoch 4 batch id 2201 loss 0.944153904914856 train acc 0.5605619604725125
epoch 4 batch id 2401 loss 0.6064562201499939 train acc 0.5660727301124532
epoch 4 train acc 0.5669325691862867


  0%|          | 0/312 [00:00<?, ?it/s]

epoch 4 test acc 0.3094319331983806


  0%|          | 0/2421 [00:00<?, ?it/s]

epoch 5 batch id 1 loss 1.6334104537963867 train acc 0.4375
epoch 5 batch id 201 loss 1.0014060735702515 train acc 0.533193407960199
epoch 5 batch id 401 loss 0.718906581401825 train acc 0.5553693890274314
epoch 5 batch id 601 loss 0.6888638734817505 train acc 0.5722493760399334
epoch 5 batch id 801 loss 1.1933661699295044 train acc 0.574438202247191
epoch 5 batch id 1001 loss 1.2040009498596191 train acc 0.5765484515484516
epoch 5 batch id 1201 loss 0.1983967274427414 train acc 0.5839144462947544
epoch 5 batch id 1401 loss 1.2522974014282227 train acc 0.5804559243397573
epoch 5 batch id 1601 loss 1.1566877365112305 train acc 0.5843125390381012
epoch 5 batch id 1801 loss 1.4348952770233154 train acc 0.5844496113270405
epoch 5 batch id 2001 loss 1.7557414770126343 train acc 0.5608523863068465
epoch 5 batch id 2201 loss 1.2133095264434814 train acc 0.5493951612903226
epoch 5 batch id 2401 loss 0.9773550629615784 train acc 0.5419942211578509
epoch 5 train acc 0.5412277984304007


  0%|          | 0/312 [00:00<?, ?it/s]

epoch 5 test acc 0.5204748650472335


In [26]:
def predict(predict_sentence):

    data = [predict_sentence, '0']
    dataset_another = [data]

    another_test = BERTDataset(dataset_another, 0, 1, tok, vocab, max_len, True, False)
    test_dataloader = torch.utils.data.DataLoader(another_test, batch_size=batch_size, num_workers=5)

    model.eval()

    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(test_dataloader):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)

        valid_length= valid_length
        label = label.long().to(device)

        out = model(token_ids, valid_length, segment_ids)


        test_eval=[]
        for i in out:
            logits=i
            logits = logits.detach().cpu().numpy()

            if np.argmax(logits) == 0:
                test_eval.append("공포가")
            elif np.argmax(logits) == 1:
                test_eval.append("놀람이")
            elif np.argmax(logits) == 2:
                test_eval.append("분노가")
            elif np.argmax(logits) == 3:
                test_eval.append("슬픔이")
            elif np.argmax(logits) == 4:
                test_eval.append("중립이")
            elif np.argmax(logits) == 5:
                test_eval.append("행복이")
            elif np.argmax(logits) == 6:
                test_eval.append("혐오가")

        print(">> 입력하신 내용에서 " + test_eval[0] + " 느껴집니다.")

In [27]:
#질문 무한반복하기! 0 입력시 종료
end = 1
while end == 1 :
    sentence = input("하고싶은 말을 입력해주세요 : ")
    if sentence == "0" :
        break
    predict(sentence)
    print("\n")

하고싶은 말을 입력해주세요 : 오늘은 정말 우울했어
>> 입력하신 내용에서 놀람이 느껴집니다.


하고싶은 말을 입력해주세요 : 슬프다
>> 입력하신 내용에서 놀람이 느껴집니다.


하고싶은 말을 입력해주세요 : 미친 바보 아니야?
>> 입력하신 내용에서 중립이 느껴집니다.


하고싶은 말을 입력해주세요 : ㅋㅋ 아 얘 진짜 하나도 못맞추네
>> 입력하신 내용에서 분노가 느껴집니다.


하고싶은 말을 입력해주세요 : 행복해~
>> 입력하신 내용에서 슬픔이 느껴집니다.


하고싶은 말을 입력해주세요 : 와 진짜 너 진짜 최악이야........
>> 입력하신 내용에서 중립이 느껴집니다.


하고싶은 말을 입력해주세요 : 0
