# Setting

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torchtext import data, datasets
from torchtext.vocab import Vectors
from torchtext.data import TabularDataset
from torchtext.data import Iterator

import sklearn.metrics as metrics
import pandas as pd
import numpy as np
import gensim
import random
import os
import warnings
warnings.filterwarnings(action='ignore')

from konlpy.tag import Mecab

In [5]:
# 각종 전역변수들
data_path = 'my_korean_hate_speech\\data\\naver_review'
path_train_data = 'my_korean_hate_speech\\data\\naver_review\\ratings_train.tsv'
path_dev_data = 'my_korean_hate_speech\\data\\naver_review\\ratings_test.tsv'
BATCH_SIZE = 32
HIDDEN_DIM = 128
NUM_LSTM_LAYER = 2
n_classes = 2
learning_rate = 0.01
MAX_LEN = 120

# 시드 고정
SEED = 5
random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x1377c98b190>

In [6]:
# CUDA setting 확인
USE_CUDA = torch.cuda.is_available()
DEVICE = torch.device("cuda" if USE_CUDA else "cpu")
print("cpu와 cuda 중 다음 기기로 학습함:", DEVICE)

cpu와 cuda 중 다음 기기로 학습함: cuda


In [7]:
# data read
train_data = pd.read_csv(path_train_data, sep='\t')
dev_data = pd.read_csv(path_dev_data, sep='\t')

print("train data shape: {}".format(train_data.shape))
print("dev data shape: {}".format(dev_data.shape))
train_data.head()

train data shape: (150000, 3)
dev data shape: (50000, 3)


Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


In [8]:
def preprocess_label(label, only_hate=False):
    label_matching = {'0':0, '1':1}
    
    return label_matching[label]

In [9]:
tokenizer = Mecab('C:\mecab\mecab-ko-dic')

id = data.Field(sequential=False,
                  use_vocab=False)

document = data.Field(sequential=True,
                      use_vocab=True,
                      tokenize=tokenizer.morphs,
                      lower=True,
                      batch_first=True, 
                      fix_length = MAX_LEN)

label = data.Field(sequential=False,
                   use_vocab=False,
                   is_target=True,
                   batch_first=True, 
                   preprocessing=lambda x:preprocess_label(x))

In [10]:
train_data, test_data = TabularDataset.splits(
    path=data_path, train='ratings_train.tsv', test='ratings_test.tsv', format='tsv',
    fields=[('id', id), ('document', document), ('label', label)],
    skip_header=True)

train_data, val_data = train_data.split(split_ratio=0.8)

print('훈련 샘플의 개수 : {}'.format(len(train_data)))
print('검증 샘플의 개수 : {}'.format(len(val_data)))
print('테스트 샘플의 개수 : {}'.format(len(test_data)))
print(vars(train_data[0]))

훈련 샘플의 개수 : 120000
검증 샘플의 개수 : 30000
테스트 샘플의 개수 : 50000
{'id': '1894102', 'document': ['ㅋㅋㅋ', 'ㅋㅋㅋ', 'ㅋㅋㅋ', 'ㅋㅋㅋ', 'ㅋㅋㅋ', 'ㅋㅋㅋ', 'ㅋㅋㅋ', '멋있', '는', '영화'], 'label': 0}


In [11]:
max_len = 0
for i in range(len(train_data)):
    if max_len < len(vars(train_data[i])['document']):
        max_len = len(vars(train_data[i])['document'])
print(max_len)

116


In [13]:
document.build_vocab(train_data, min_freq=3, max_size=30000)
#hate.build_vocab(train_data)
document.vocab.stoi

print('단어 집합의 크기 : {}'.format(len(document.vocab)))

단어 집합의 크기 : 20042


In [14]:
"""
# 다른 방법
train_loader = Iterator(dataset=train_data, batch_size = BATCH_SIZE)
test_loader = Iterator(dataset=test_data, batch_size = BATCH_SIZE)

print('훈련 데이터의 미니 배치 수 : {}'.format(len(train_loader)))
print('테스트 데이터의 미니 배치 수 : {}'.format(len(test_loader)))
"""

train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train_data, val_data, test_data), batch_size=BATCH_SIZE,
        shuffle=True, repeat=False, sort_within_batch=False, sort_key=lambda x: len(x.document))

print('훈련 데이터의 미니 배치의 개수 : {}'.format(len(train_iter)))
print('테스트 데이터의 미니 배치의 개수 : {}'.format(len(test_iter)))
print('검증 데이터의 미니 배치의 개수 : {}'.format(len(val_iter)))

훈련 데이터의 미니 배치의 개수 : 3750
테스트 데이터의 미니 배치의 개수 : 1563
검증 데이터의 미니 배치의 개수 : 938


## BiLSTM

In [15]:
class BiLSTM(nn.Module):
    def __init__(self, pre_embedding, hidden_dim, num_lstm_layer, n_classes, dropout=0.1):
        super(BiLSTM, self).__init__()
        
        self.hidden_dim = hidden_dim
        self.num_lstm_layer = num_lstm_layer
        self.n_classes = n_classes
        self.embedding = nn.Embedding(20042, 200)
        
        # BiLSTM layer 세팅
        self.bi_lstm = nn.LSTM(input_size=self.embedding.embedding_dim,
                               hidden_size=self.hidden_dim,
                               num_layers=self.num_lstm_layer,
                               dropout=dropout,
                               batch_first=True,
                               bidirectional=True)
        
        # bidirectional 이라서 hidden_dim * 2
        self.linear = nn.Linear(self.hidden_dim * 2, self.n_classes)
        
    def forward(self, sents):
        # embedding 
        #print('sents: ', sents)
        embedded = self.embedding(sents)
        #print('embedded: ', embedded)
        
        # lstm 통과
        lstm_out, (h_n, c_n) = self.bi_lstm(embedded) # (h_0, c_0) = (0, 0)
        
        # forward와 backward의 마지막 time-step의 은닉 상태를 가지고 와서 concat
        # 이때 모델이 batch_first라는 점에 주의한다. (dimension 순서가 바뀜)
        hidden = torch.cat((h_n[-2,:,:], h_n[-1,:,:]), dim = 1)
        #print('hidden: ', hidden)
        out=self.linear(hidden)
        #print('out: ', out)
        #out=self.lin_layers(hidden)
        
        return out

In [16]:
def train(model, optimizer, train_iter):
    model.train()
    
    corrects, total_loss = 0, 0
    
    for b, batch in enumerate(train_iter):
        # comments 는 x, hate(label)은 y로 두고
        x, y = batch.document.to(DEVICE), batch.label.to(DEVICE)
        # gradient 0으로 세팅해두고
        optimizer.zero_grad()
        # model 돌리고
        prediction = model(x)
        # loss 구해서 backprop
        loss = criterion(prediction, y)
        total_loss = total_loss + loss.item()
        loss.backward()
        optimizer.step()
    
    size = len(train_iter.dataset)
    avg_loss = total_loss / size
    
    return avg_loss
        
def evaluate(model, val_iter):
    model.eval()
    
    corrects, total_loss = 0, 0
    
    for b, batch in enumerate(val_iter):
        x, y = batch.document.to(DEVICE), batch.label.to(DEVICE)
        prediction = model(x)
        loss = criterion(prediction, y)
        total_loss = total_loss + loss.item()
        corrects = corrects + (prediction.max(1)[1].view(y.size()).data == y.data).sum()
        
        y = y.data
        y = y.to("cpu")
        y = y.detach().numpy()
        p = prediction.max(1)[1].data
        p = p.to("cpu")
        p = p.detach().numpy()
    
    print('**** Look only last batch case ****')
    print(metrics.classification_report(y,p))
        
    size = len(val_iter.dataset)
    avg_loss = total_loss / size
    avg_accuracy = 100.0 * corrects / size
    
    return avg_loss, avg_accuracy

# BiLSTM 선언

In [17]:
bilstm_model = BiLSTM(document.vocab.vectors, HIDDEN_DIM, NUM_LSTM_LAYER, n_classes)
print(bilstm_model)
bilstm_model = bilstm_model.to(DEVICE)

optimizer = torch.optim.Adam(bilstm_model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

BiLSTM(
  (embedding): Embedding(20042, 200)
  (bi_lstm): LSTM(200, 128, num_layers=2, batch_first=True, dropout=0.1, bidirectional=True)
  (linear): Linear(in_features=256, out_features=2, bias=True)
)


# BiLSTM training

In [None]:
best_val_loss = None
num_epoch = 5

print('[STAGE] Train')
for i in range(1, num_epoch+1):
    train_loss = train(bilstm_model, optimizer, train_iter)
    val_loss, val_accuracy = evaluate(bilstm_model, val_iter)
    result = (
        f'[Epoch: {i}/{num_epoch}] train loss : {train_loss:.4f} '
        f'| val loss : {val_loss:.4f} | val accuracy : {val_accuracy:.4f}%'
    )
    print(result)
    
    # 검증 오차가 가장 적은 최적의 모델을 저장
    if not best_val_loss or val_loss < best_val_loss:
        if not os.path.isdir("snapshot"):
            os.makedirs("snapshot")
        torch.save(bilstm_model.state_dict(), './snapshot/bilstm_test.pt')
        best_val_loss = val_loss

[STAGE] Train
**** Look only last batch case ****
              precision    recall  f1-score   support

           0       0.92      0.92      0.92        13
           1       0.67      0.67      0.67         3

    accuracy                           0.88        16
   macro avg       0.79      0.79      0.79        16
weighted avg       0.88      0.88      0.88        16

[Epoch: 1/5] train loss : 0.0124 | val loss : 0.0112 | val accuracy : 84.2933%
**** Look only last batch case ****
              precision    recall  f1-score   support

           0       1.00      0.62      0.76        13
           1       0.38      1.00      0.55         3

    accuracy                           0.69        16
   macro avg       0.69      0.81      0.65        16
weighted avg       0.88      0.69      0.72        16

[Epoch: 2/5] train loss : 0.0104 | val loss : 0.0113 | val accuracy : 84.2667%
**** Look only last batch case ****
              precision    recall  f1-score   support

           

# BiLSTM Test

In [None]:
bilstm_model.load_state_dict(torch.load('./snapshot/bilstm_test.pt'))
test_loss, test_acc = evaluate(bilstm_model, test_iter)
result = f'테스트 오차: {test_loss:.4f} | 테스트 정확도: {test_acc:.4f}%'
print(result)