# Install

In [4]:
# Install the third-party packages this notebook imports.
# Only transformers is version-pinned here (3.0.2); the other packages resolve to
# whatever version pip selects at install time.
!pip install mxnet
!pip install gluonnlp pandas tqdm
!pip install sentencepiece
!pip install transformers==3.0.2
!pip install torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gluonnlp
  Using cached gluonnlp-0.10.0.tar.gz (344 kB)
Building wheels for collected packages: gluonnlp
  Building wheel for gluonnlp (setup.py) ... [?25l[?25hdone
  Created wheel for gluonnlp: filename=gluonnlp-0.10.0-cp38-cp38-linux_x86_64.whl size=619647 sha256=e132e3f341df7732d6c4f9060cce1f15f090827c478255680a86e9acc8b63593
  Stored in directory: /root/.cache/pip/wheels/b6/93/9d/2237550c409eb3ed725d6302b7897ddd9a037b40cef66dcd9c
Successfully built gluonnlp
Installing collected packages: gluonnlp
Successfully installed gluonnlp-0.10.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==3.0.2
  Using cached 

In [5]:
# Install the KoBERT package straight from the SKTBrain GitHub repository (master branch).
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://****@github.com/SKTBrain/KoBERT.git@master
  Cloning https://****@github.com/SKTBrain/KoBERT.git (to revision master) to /tmp/pip-req-build-evz5u39l
  Running command git clone -q 'https://****@github.com/SKTBrain/KoBERT.git' /tmp/pip-req-build-evz5u39l
Collecting boto3<=1.15.18
  Downloading boto3-1.15.18-py2.py3-none-any.whl (129 kB)
[K     |████████████████████████████████| 129 kB 10.5 MB/s 
Collecting mxnet<=1.7.0.post2,>=1.4.0
  Downloading mxnet-1.7.0.post2-py2.py3-none-manylinux2014_x86_64.whl (54.7 MB)
[K     |████████████████████████████████| 54.7 MB 16 kB/s 
[?25hCollecting onnxruntime<=1.8.0,==1.8.0
  Downloading onnxruntime-1.8.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.5 MB)
[K     |████████████████████████████████| 4.5 MB 49.7 MB/s 
[?25hCollecting sentencepiece<=0.1.96,>=0.1.6
  Downloading sentencepiece-0.1.96-cp38-cp38-manyli

# 필요 라이브러리

In [6]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook

#kobert
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

#transformers
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

In [7]:
# Prefer the first GPU, but fall back to the CPU when CUDA is unavailable so that
# later `.to(device)` calls do not fail on a CPU-only runtime.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Load the pretrained KoBERT model together with its vocabulary.
bertmodel, vocab = get_pytorch_kobert_model()

/content/.cache/kobert_v1.zip[██████████████████████████████████████████████████]
/content/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece[██████████████████████████████████████████████████]


# 데이터셋 불러오기

In [8]:
import pandas as pd
# Read the emotion-labelled sentence spreadsheet into a DataFrame.
# NOTE(review): the absolute path presumably points at a mounted Google Drive — the
# drive must be mounted before this cell runs; confirm.
chatbot_data = pd.read_excel('/content/drive/MyDrive/CapstonDesign/한국어 감정대화데이터셋.xlsx')

# 데이터 전처리

In [9]:
chatbot_data.sample(n=10)

# Replace each Korean emotion name in the 'Emotion' column with its integer class id
# (in place, in the same order as before: fear, surprise, anger, sadness, happiness).
emotion_to_id = {"공포": 0, "놀람": 1, "분노": 2, "슬픔": 3, "행복": 4}
for emotion_name, class_id in emotion_to_id.items():
    chatbot_data.loc[(chatbot_data['Emotion'] == emotion_name), 'Emotion'] = class_id

# Pair every sentence with its label rendered as a string: [sentence, "label"].
data_list = [
    [q, str(label)]
    for q, label in zip(chatbot_data['Sentence'], chatbot_data['Emotion'])
]

# Spot-check a handful of rows spread across the dataset, plus the last one.
for row_index in (0, 6000, 12000, 18000, 24000, -1):
    print(data_list[row_index])

['언니 동생으로 부르는게 맞는 일인가요..??', '0']
['기술적으로도 아직도 해체해서 다시 완벽히 돌려놓는게 어려운데 해체를한다고?', '1']
['당연히 그렇게 해야지 우리나라도 판매를 중단하라', '2']
['그거들은 뒤부터 미치겠어요...', '3']
['대박한 앨범인 것 같아요ㅠㅠ', '4']
['유재석 오라버니 해피투게더 봤어요', '4']


# Train data & Test data

In [10]:
# Split the [sentence, label] pairs into training and test sets:
# 25% is held out for testing, and random_state=0 fixes the shuffle.
from sklearn.model_selection import train_test_split

dataset_train, dataset_test = train_test_split(data_list, test_size=0.25, random_state=0)

# KoBERT 입력 데이터로 만들기

In [11]:
class BERTDataset(Dataset):
    """Dataset that pre-tokenizes sentences into BERT inputs and stores their labels.

    Args:
        dataset: Sequence of rows; each row is indexable by ``sent_idx`` and ``label_idx``.
        sent_idx: Position of the sentence inside a row.
        label_idx: Position of the label inside a row (anything ``np.int32`` accepts).
        bert_tokenizer: Tokenizer handed to ``nlp.data.BERTSentenceTransform``.
        max_len: Forwarded as ``max_seq_length`` to the transform.
        pad: Forwarded as ``pad`` to the transform.
        pair: Forwarded as ``pair`` to the transform.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        to_bert_input = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        # Everything is transformed eagerly, once, at construction time.
        self.sentences = [to_bert_input([row[sent_idx]]) for row in dataset]
        self.labels = [np.int32(row[label_idx]) for row in dataset]

    def __getitem__(self, i):
        """Return the transformed sentence tuple with the label appended as the last item."""
        features = self.sentences[i]
        label = self.labels[i]
        return features + (label, )

    def __len__(self):
        """Number of samples, taken from the label list."""
        return len(self.labels)

# Setting parameters
# These module-level values are shared by the dataset, optimizer and scheduler cells.
max_len = 64
batch_size = 64
warmup_ratio = 0.1
num_epochs = 10
max_grad_norm = 1
log_interval = 200
learning_rate =  5e-5

# Tokenization: wrap the KoBERT SentencePiece tokenizer with the loaded vocabulary.
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

# Column 0 holds the sentence and column 1 the label; the trailing True/False are the
# `pad` and `pair` arguments of BERTDataset.
data_train = BERTDataset(dataset_train, 0, 1, tok, max_len, True, False)
data_test = BERTDataset(dataset_test, 0, 1, tok, max_len, True, False)

# Check that tokenization and padding worked as expected.
# NOTE(review): this is not the last expression of the cell, so Jupyter will not display it.
data_train[0]

# Wrap the datasets in PyTorch DataLoaders.
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)

using cached model. /content/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


  data = _utils.pin_memory.pin_memory(data)


# KoBERT 학습모델 만들기

In [40]:
class BERTClassifier(nn.Module):
    """Linear classification head on top of a BERT encoder's pooled output.

    Args:
        bert: Encoder module. It is called with ``input_ids``, ``token_type_ids`` and
            ``attention_mask`` and must return a 2-tuple whose second item is the
            pooled representation.
        hidden_size: Width of the pooled representation fed to the linear layer.
        num_classes: Number of output classes.
        dr_rate: Dropout probability applied to the pooled output; falsy disables dropout.
        params: Unused; kept so existing callers keep working.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=5,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Build a float mask with 1.0 on the first ``valid_length[i]`` positions of row i.

        Args:
            token_ids: 2-D tensor of token ids; only its shape and device are used.
            valid_length: Per-row count of real (non-padding) tokens.

        Returns:
            Float tensor shaped like ``token_ids`` on the same device.
        """
        positions = torch.arange(token_ids.size(1), device=token_ids.device)
        # valid_length may arrive on the CPU while token_ids is on the GPU.
        lengths = torch.as_tensor(valid_length, device=token_ids.device).reshape(-1, 1)
        return (positions.unsqueeze(0) < lengths).float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return class logits for a batch.

        Args:
            token_ids: Token id tensor passed to the encoder as ``input_ids``.
            valid_length: Per-row count of real tokens, used to build the attention mask.
            segment_ids: Segment ids, cast to long and passed as ``token_type_ids``.
        """
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids = token_ids,
                              token_type_ids = segment_ids.long(),
                              attention_mask = attention_mask)
        # Without dropout, feed the pooled output straight through; previously `out`
        # was left undefined in that case and the classifier call raised.
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)

# Build the classifier around the pretrained KoBERT encoder and move it to the target device.
model = BERTClassifier(bertmodel,  dr_rate=0.5).to(device)

# Optimizer and schedule setup.
# Parameters whose name contains 'bias' or 'LayerNorm.weight' get no weight decay;
# every other parameter uses a weight decay of 0.01.
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

# Total number of optimizer steps (batches per epoch * epochs); the first
# warmup_ratio fraction of them is used as warmup.
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

# Helper used to measure accuracy.
def calc_accuracy(X,Y):
    """Return the fraction of rows in X whose highest-scoring column equals the label in Y.

    Args:
        X: 2-D score tensor (one row per sample).
        Y: 1-D tensor of expected class indices.
    """
    _, predicted = torch.max(X, 1)
    n_correct = (predicted == Y).sum().data.cpu().numpy()
    n_samples = predicted.size()[0]
    return n_correct / n_samples
    
train_dataloader

<torch.utils.data.dataloader.DataLoader at 0x7f547fcd7b50>

# 모델 저장 및 불러오기

In [41]:
# Load the fine-tuned classifier saved earlier.
# map_location remaps the stored tensors onto the device this notebook is running on,
# so a checkpoint saved on a GPU can still be loaded elsewhere.
# SECURITY: torch.load unpickles the file, which can execute arbitrary code —
# only load checkpoints from a trusted source.
model = torch.load('/content/drive/MyDrive/CapstonDesign/model.pt', map_location=device)

# 노래 추천 데이터

In [42]:
import random

def fear_music():
    """Return the ten 'title - artist' tracks recommended when fear is detected."""
    # Artist credits from "Cake By The Ocean" onward were shifted by one row
    # (e.g. "Sunflower" was attributed to Jake Miller); they are realigned here.
    FearMusicList=[
        'Sweet but Psycho - Ava Max',
        "Don't Start Now - Dua Lipa",
        'Rockabye - Clean Bandit',
        'BAD - Christopher',
        'Shake It Off - Taylor Swift',
        'Bang Bang - Jessie J, Ariana Grande, Nicki Minaj',
        'Cake By The Ocean - DNCE',
        'Sunflower - Post Malone, Swae Lee',
        'Rumors - Jake Miller',
        'Faith (ft. Ariana Grande) - Stevie Wonder'
        ]
    return FearMusicList
        

def surprised_music():
    """Return the ten 'title - composer' pieces recommended when surprise is detected."""
    # "Moning Mood" was a typo for Grieg's "Morning Mood".
    SuprisedMusicList=[
        'Morning Mood - Grieg',
        'Les Toreadors(carmen) - Bizet',
        "Salut d'amour - Elgar",
        "Piano Sonata 'Tempest' - Beethoven",
        'Eine Kleine Nachtmusik - Mozart',
        'Air on the G String - Bach',
        "Four Seasons 'Summer' - Vivaldi",
        'Piano Sonata No.21 - Mozart',
        'Fur Elise - Beethoven',
        "Four Seasons 'Winter' - Piazzolla"
        ]
    return SuprisedMusicList
    

def anger_music():
    """Return the ten 'title - artist' tracks recommended when anger is detected."""
    return [
        'Centuries - Fall Out Boy',
        'The Phoenix - Fall Out Boy',
        'Faint - Linkin Park',
        'Bleed It Out - Linkin Park',
        'Shotgun Blues - Volbeat',
        'Seal The Deal - Volbeat',
        'Fuel - Metallica',
        'Master of Puppets - Metallica',
        'Enter Sandman - Metallica',
        'Creeping Death - Metallica',
    ]


def sadness_music():
    """Return the ten 'title - artist' tracks recommended when sadness is detected."""
    return [
        '너의 하루는 어때? - 앤씨아',
        '아프지 말고 아프지 말자 - 우연수',
        '한숨 - 이하이',
        'Alone - Crush',
        'Stars - 로시',
        '이름에게 - 아이유',
        '마음 - 폴킴',
        '위로 - 권진아',
        '홀로 - 이하이',
        '너의 얘길 들어줄게 - 윤미래',
    ]
    

def happy_music():
    """Return the ten 'title - artist' tracks recommended when happiness is detected."""
    # Two entries contained a stray TAB before the dash (printed as a ragged gap) and
    # "Seventeen" was misspelled; every entry now follows the same 'title - artist' form.
    HappyMusicList=[
        '아주 NICE - 세븐틴(Seventeen)',
        'Power Up - Red Velvet(레드벨벳)',
        'ASAP - STAYC(스테이씨)',
        '음오아예(Um Oh Ah Yeh) - 마마무(Mamamoo)',
        'SUPER Clap - 슈퍼주니어',
        '내가 제일 잘 나가 - 2NE1',
        'IDOL - 방탄소년단',
        '강남스타일 - 싸이(PSY)',
        '나팔바지 - 싸이(PSY)',
        '빠빠빠 - 크레용 팝(Crayon Pop)'
        ]
    return HappyMusicList

# 새로운 문장 테스트

In [43]:
 # Tokenization: rebuild the SentencePiece tokenizer used by predict().
 # NOTE(review): this repeats the tokenizer setup from the dataset cell — presumably
 # so this section can run on its own; confirm before removing.
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)


def predict(predict_sentence):
    """Classify the emotion of one sentence and print three matching song picks.

    The sentence is tokenized with the module-level ``tok``/``max_len``, run through
    the module-level ``model`` on ``device``, and the arg-max class selects both the
    emotion word in the message and the playlist that is sampled.

    Args:
        predict_sentence: Raw input sentence.

    Returns:
        None. The result is printed.
    """
    # Class id -> (emotion word with its Korean subject particle, playlist provider).
    emotion_table = {
        0: ("공포가", fear_music),
        1: ("놀람이", surprised_music),
        2: ("분노가", anger_music),
        3: ("슬픔이", sadness_music),
        4: ("행복이", happy_music),
    }

    # BERTDataset expects [sentence, label] rows; '0' is a dummy label that is never used.
    dataset_another = [[predict_sentence, '0']]
    another_test = BERTDataset(dataset_another, 0, 1, tok, max_len, True, False)
    # A single sentence does not justify spawning worker processes on every call.
    loader = torch.utils.data.DataLoader(another_test, batch_size=batch_size, num_workers=0)

    model.eval()

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for token_ids, valid_length, segment_ids, _ in loader:
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)

            out = model(token_ids, valid_length, segment_ids)

            for logits in out:
                class_id = int(np.argmax(logits.detach().cpu().numpy()))
                if class_id not in emotion_table:
                    # As before, an unknown class id prints nothing.
                    continue

                emotion_word, playlist = emotion_table[class_id]
                print(">> 오늘의 문장에서 " + emotion_word + " 느껴집니다.", "\n")
                print('\n'.join(random.sample(playlist(), 3)))

using cached model. /content/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [45]:
# Keep asking for sentences until the user enters '0'.
# The old `end` sentinel never changed, so the loop is written as `while True`.
while True:
    sentence = input("감정분석을 위해 문장을 입력해주세요 : ")
    if sentence == '0':
        break
    predict(sentence)
    print("\n")

감정분석을 위해 문장을 입력해주세요 : 코딩 잘하고 싶다
>> 오늘의 문장에서 슬픔이 느껴집니다. 

너의 하루는 어때? - 앤씨아
Alone - Crush
한숨 - 이하이


감정분석을 위해 문장을 입력해주세요 : 어제 영화 지렸다
>> 오늘의 문장에서 행복이 느껴집니다. 

빠빠빠 - 크레용 팝(Crayon Pop)
ASAP - STAYC(스테이씨)
나팔바지 - 싸이(PSY)


감정분석을 위해 문장을 입력해주세요 : 침대에 누워있고싶어
>> 오늘의 문장에서 슬픔이 느껴집니다. 

마음 - 폴킴
위로 - 권진아
이름에게 - 아이유


감정분석을 위해 문장을 입력해주세요 : 와 저거 뭐냐
>> 오늘의 문장에서 놀람이 느껴집니다. 

Air on the G String - Bach
Piano Sonata No.21 - Mozart
Four Seasons 'Summer' - Vivaldi


감정분석을 위해 문장을 입력해주세요 : 안녕하세요 누구시죠
>> 오늘의 문장에서 행복이 느껴집니다. 

ASAP - STAYC(스테이씨)
IDOL - 방탄소년단
강남스타일 - 싸이(PSY)


감정분석을 위해 문장을 입력해주세요 : 0


# 참고문헌

- https://velog.io/@seolini43/KOBERT%EB%A1%9C-%EB%8B%A4%EC%A4%91-%EB%B6%84%EB%A5%98-%EB%AA%A8%EB%8D%B8-%EB%A7%8C%EB%93%A4%EA%B8%B0-%ED%8C%8C%EC%9D%B4%EC%8D%ACColab 
- https://hoit1302.tistory.com/159
- https://hipster4020.tistory.com/109
- 김경재, 「BERT 기반 감성분석을 이용한 추천시스템」, 동국대학교 일반논문, 2021.03
- https://sig413.tistory.com/5
