In [1]:
!pip install mxnet
!pip install gluonnlp pandas tqdm
!pip install sentencepiece
!pip install transformers==3 # 최신 버전으로 설치하면 "Input: must be Tensor, not str" 라는 에러 발생
!pip install torch
!pip install torchmetrics

!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==3
  Using cached transformers-3.0.0-py3-none-any.whl (754 kB)
Collecting tokenizers==0.8.0-rc4
  Using cached tokenizers-0.8.0rc4-cp37-cp37m-manylinux1_x86_64.whl (3.0 MB)
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.12.1
    Uninstalling tokenizers-0.12.1:
      Successfully uninstalled tokenizers-0.12.1
  Attempting uninstall: transformers
    Found existing installation: transformers 4.20.1
    Uninstalling transformers-4.20.1:
      Successfully uninstalled transformers-4.20.1


In [24]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook

from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
import pandas as pd
from tqdm import tqdm_notebook

# Load KoBERT once for the whole notebook. The call yields a (model, vocab)
# pair; `vocab` is what kobert_sent() receives to build its tokenizer.
model, vocab = get_pytorch_kobert_model()
class BERTDataset(Dataset):
    """Dataset of BERT-ready sentence features paired with integer labels.

    Each row of ``dataset`` is indexed with ``sent_idx`` to get the raw
    sentence and with ``label_idx`` to get its label. All sentences are
    transformed eagerly, at construction time, by
    ``nlp.data.BERTSentenceTransform``.

    Args:
        dataset: iterable of indexable rows.
        sent_idx: position of the sentence inside a row.
        label_idx: position of the label inside a row.
        bert_tokenizer: tokenizer forwarded to ``BERTSentenceTransform``.
        max_len: forwarded as ``max_seq_length``.
        pad: forwarded as ``pad``.
        pair: forwarded as ``pair``.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        to_features = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        # First pass: transform every sentence (wrapped in a one-element list).
        encoded = []
        for row in dataset:
            encoded.append(to_features([row[sent_idx]]))
        self.sentences = encoded

        # Second pass: labels are stored as int32 scalars.
        targets = []
        for row in dataset:
            targets.append(np.int32(row[label_idx]))
        self.labels = targets

    def __getitem__(self, i):
        """Return the i-th feature tuple with its label appended as the last item."""
        features = self.sentences[i]
        return features + (self.labels[i],)

    def __len__(self):
        """Number of samples, taken from the label list."""
        return len(self.labels)

class BERTClassifier(nn.Module):
    """Linear classification head on top of a BERT encoder's pooled output.

    Args:
        bert: encoder module. It is called with the keywords ``input_ids``,
            ``token_type_ids`` and ``attention_mask`` and must return a pair
            whose second element is the pooled output.
        hidden_size: feature size of the pooled output fed to the linear layer.
        num_classes: number of output logits (use 2 for binary classification).
        dr_rate: dropout probability applied to the pooled output. A falsy
            value (``None`` or ``0``) disables dropout.
        params: accepted for interface compatibility; not used.
    """

    def __init__(self,
                 bert,
                 hidden_size=768,
                 num_classes=6,  # softmax over the classes; use 2 for binary
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size, num_classes)
        # The dropout layer only exists when a rate was requested; forward()
        # checks ``dr_rate`` rather than assuming the attribute is present.
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Build a float mask shaped like ``token_ids``.

        Row ``i`` holds 1.0 in its first ``valid_length[i]`` positions and 0.0
        elsewhere.
        """
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return classification logits for a batch.

        Args:
            token_ids: integer tensor of token ids, one row per sample.
            valid_length: per-sample count of non-padding tokens.
            segment_ids: token-type ids; cast to ``long`` before use.

        Returns:
            Output of the linear layer applied to the (optionally dropped-out)
            pooled encoder output.
        """
        # zeros_like() already places the mask on token_ids' device and
        # gen_attention_mask() already returns float, so no further casting.
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids=token_ids,
                              token_type_ids=segment_ids.long(),
                              attention_mask=attention_mask)

        # Fix: previously ``out`` was only assigned when dropout was enabled,
        # so the default ``dr_rate=None`` raised UnboundLocalError here.
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)

using cached model. /content/.cache/kobert_v1.zip
using cached model. /content/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [25]:
def kobert_sent(text, big_small, vocab, model_path, max_len=None):
    """Predict the emotion label of ``text`` with a saved KoBERT classifier.

    Args:
        text: input sentence; converted with ``str()``.
        big_small: ``'big'`` selects the 6-label emotion set, ``'small'`` the
            25-label set.
        vocab: vocabulary handed to ``nlp.data.BERTSPTokenizer``.
        model_path: path of a checkpoint that ``torch.load`` turns into a
            callable model taking ``(token_ids, valid_length, segment_ids)``.
        max_len: optional override for the tokenizer's ``max_seq_length``.
            When ``None`` the historical defaults are used (6 for ``'big'``,
            25 for ``'small'``).

    Returns:
        The predicted label string from the selected label set.

    Raises:
        ValueError: if ``big_small`` is neither ``'big'`` nor ``'small'``.
        TypeError: if ``text`` cannot be converted to ``str``.
    """
    # Index -> label tables for the two classification granularities.
    big_sent_idx_to_dict = {0: '기쁨', 1: '불안', 2: '당황', 3: '슬픔', 4: '분노', 5: '상처'}
    small_sent_idx_to_dict = {
        0: '기쁜',
        1: '스트레스 받는',
        2: '당황',
        3: '편안한',
        4: '불안',
        5: '우울한',
        6: '짜증내는',
        7: '분노',
        8: '상처',
        9: '열등감',
        10: '죄책감의',
        11: '당혹스러운',
        12: '두려운',
        13: '실망한',
        14: '염세적인',
        15: '충격 받은',
        16: '슬픔',
        17: '억울한',
        18: '외로운',
        19: '괴로워하는',
        20: '자신하는',
        21: '질투하는',
        22: '감사하는',
        23: '후회되는',
        24: '툴툴대는',
    }

    # Validate the selector before any expensive work. The old code printed a
    # message and then failed later with an unrelated UnboundLocalError.
    if big_small == 'big':
        default_max_len = 6
        sent_dict = big_sent_idx_to_dict
    elif big_small == 'small':
        default_max_len = 25
        sent_dict = small_sent_idx_to_dict
    else:
        raise ValueError("감정분류를 'big' or 'small'로 입력하십시오")

    # NOTE(review): the defaults 6 / 25 equal the number of classes in each
    # label set, which suggests num_classes was used as the maximum sequence
    # length. They are kept for backward compatibility; pass ``max_len`` to
    # match the value used at training time (the example checkpoint name
    # contains "ml250", which may be that value -- TODO confirm).
    if max_len is None:
        max_len = default_max_len

    # Surface conversion failures instead of swallowing them with a bare except.
    try:
        test_sentence = str(text)
    except Exception as exc:
        raise TypeError('문장을 확인하세요') from exc

    # Prefer the first GPU, but fall back to CPU so the function also runs on
    # machines without CUDA.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    tokenizer = get_tokenizer()
    tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

    # One-row dataset: [sentence, dummy label 0]. BERTDataset only indexes
    # rows, so a plain nested list is enough.
    test_set = BERTDataset([[test_sentence, 0]], 0, 1, tok, max_len, True, False)
    # A single sample does not need worker processes.
    test_input = torch.utils.data.DataLoader(test_set, batch_size=1, num_workers=0)

    # SECURITY: torch.load unpickles arbitrary Python objects -- only load
    # checkpoints from a trusted source.
    classifier = torch.load(model_path, map_location=device)
    # Inference mode: disables dropout so predictions are deterministic.
    classifier.eval()

    token_ids, valid_length, segment_ids, _ = next(iter(test_input))
    with torch.no_grad():
        out = classifier(token_ids.long().to(device),
                         valid_length,
                         segment_ids.long().to(device))

    return sent_dict[int(out[0].argmax().item())]

In [26]:
# Sample input: three utterances joined into a single string, separated by the
# literal text "['SEP']".
# NOTE(review): "['SEP']" is ordinary text inside the string; nothing in this
# notebook shows it being treated as a special token -- confirm the model was
# trained on inputs formatted this way.
text1 = "오늘 하루는 너무나 힘들었어 ['SEP'] 무슨일 잇었어? ['SEP'] 하루종일 속도 안좋고 불편했는데 술을 너무 많이 마셨나봐"

In [27]:
# Classify the sample text with the 6-label ('big') emotion set.
# NOTE(review): the checkpoint path is absolute and presumably requires Google
# Drive to be mounted at /content/drive -- confirm before running elsewhere.
kobert_sent(text1, 'big', vocab, '/content/drive/MyDrive/setiment_data/model_0714_Bsent_ml250_ep50.pt')

using cached model. /content/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


'기쁨'