# Setting

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torchtext import data, datasets
from torchtext.vocab import Vectors
from torchtext.data import TabularDataset
from torchtext.data import Iterator
from konlpy.tag import Mecab

import sklearn.metrics as metrics
import pandas as pd
import numpy as np
import gensim
import random
import os
import warnings
warnings.filterwarnings(action='ignore')

# 각종 전역변수들
data_path = 'data\\korean-hate-speech\\labeled'
path_train_data = 'data\\korean-hate-speech\\labeled\\train.tsv'
path_dev_data = 'data\\korean-hate-speech\\labeled\\dev.tsv'
BATCH_SIZE = 256
HIDDEN_DIM = 256
NUM_LSTM_LAYER = 2
n_classes = 3
learning_rate = 0.01
MAX_LEN = 80

# 시드 고정
SEED = 5
random.seed(SEED)
torch.manual_seed(SEED)

# CUDA setting 확인
USE_CUDA = torch.cuda.is_available()
DEVICE = torch.device("cuda" if USE_CUDA else "cpu")
print("cpu와 cuda 중 다음 기기로 학습함:", DEVICE)
DEVICE = 'cpu'

cpu와 cuda 중 다음 기기로 학습함: cuda


# Torchtext dataset 작성

## Torchtext를 이용한 전처리

gensim word2vec_kor를 불러내서 torch에서 이용

In [2]:
def preprocess_label(label, only_hate=False):
    if only_hate is True:
        label_matching = {'none':0, 'offensive':1, 'hate':1}
    else:
        label_matching = {'none':0, 'offensive':1, 'hate':2}
    
    return label_matching[label]

def preprocess_bias(bias):
    label_matching = {'none':0, 'others':1, 'gender':2}
    
    return label_matching[bias]

def process_words(x):
    words = []
    for word in x:
        char = []
        for character in word:
            char.append(character)
        words.append(char)
    #print(x, len(x), char, words)
    return words

### 필드 정의

In [3]:
tokenizer = Mecab('C:\mecab\mecab-ko-dic')

# comments, hate 만 사용할 거임
comments = data.Field(sequential=True,
                      use_vocab=True,
                      tokenize=tokenizer.morphs,
                      lower=True,
                      batch_first=True, 
                      fix_length = MAX_LEN)

comments_char = data.Field(sequential=True,
                           use_vocab=True,
                           tokenize=tokenizer.morphs,
                           lower=True,
                           batch_first=True, 
                           fix_length = MAX_LEN,
                           preprocessing=lambda x:process_words(x))

contain_gender_bias = data.Field(sequential=False,
                                 use_vocab=False,
                                 batch_first=True, 
                                 preprocessing=lambda x: x =='True')

bias = data.Field(sequential=False,
                  use_vocab=False,
                  batch_first=True, 
                  preprocessing=lambda x:preprocess_bias(x))

hate = data.Field(sequential=False,
                  use_vocab=False,
                  is_target=True,
                  batch_first=True, 
                  preprocessing=lambda x:preprocess_label(x))

### 본격적인 데이터셋 구성

In [4]:
train_data, val_data = TabularDataset.splits(
    path=data_path, train='train2.tsv', validation='dev2.tsv', format='tsv',
    fields=[('comments', comments), ('contain_gender_bias', contain_gender_bias), ('bias', bias), ('hate', hate), ('comments2', comments_char)],
    skip_header=True)

train_data, test_data = train_data.split(split_ratio=0.8)

print('훈련 샘플의 개수 : {}'.format(len(train_data)))
print('검증 샘플의 개수 : {}'.format(len(val_data)))
print('테스트 샘플의 개수 : {}'.format(len(test_data)))
print(vars(train_data[0]))

훈련 샘플의 개수 : 6317
검증 샘플의 개수 : 471
테스트 샘플의 개수 : 1579
{'comments': ['오창', '이채', '은', '이제', '그만', '좀', '나와라', '너무', '설정', '에', '집착', '진심', '도', '없', '구', '~', '너무', '애', '여우', '같', '아', '싫증'], 'contain_gender_bias': False, 'bias': 1, 'hate': 1, 'comments2': [['오', '창'], ['이', '채'], ['은'], ['이', '제'], ['그', '만'], ['좀'], ['나', '와', '라'], ['너', '무'], ['설', '정'], ['에'], ['집', '착'], ['진', '심'], ['도'], ['없'], ['구'], ['~'], ['너', '무'], ['애'], ['여', '우'], ['같'], ['아'], ['싫', '증']]}


### 단어 집합 만들기

In [5]:
path_word2vec_kor = './data/word2vec_kor/ko.bin'
model_kor_word2vec = gensim.models.Word2Vec.load(path_word2vec_kor)
gensim_to_torch_kor_word2vec = 'torch_kor_word2vec.wv'
model_kor_word2vec.wv.save_word2vec_format(gensim_to_torch_kor_word2vec)

In [7]:
vectors = Vectors(name=gensim_to_torch_kor_word2vec)
comments.build_vocab(train_data, vectors=vectors, min_freq=3, max_size=10000)
#hate.build_vocab(train_data)
print('단어 집합의 크기 : {}'.format(len(comments.vocab)))
print('임베딩 벡터 크기: {}'.format(comments.vocab.vectors.shape))

comments_char.build_vocab(train_data, min_freq=3, max_size=10000)
print('글자 집합의 크기 : {}'.format(len(comments_char.vocab)))

단어 집합의 크기 : 4033
임베딩 벡터 크기: torch.Size([4033, 200])
글자 집합의 크기 : 1195


### 데이터 로더 만들기

본래 미니 배치 간 샘플 길이가 모두 다르지만, 앞에 torchtext 변수를 선언할때 fix_length를 이용해 통일시켜주었다. CNN에 집어넣어야하기 때문

In [8]:
"""
# 다른 방법
train_loader = Iterator(dataset=train_data, batch_size = BATCH_SIZE)
test_loader = Iterator(dataset=test_data, batch_size = BATCH_SIZE)

print('훈련 데이터의 미니 배치 수 : {}'.format(len(train_loader)))
print('테스트 데이터의 미니 배치 수 : {}'.format(len(test_loader)))
"""

train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train_data, val_data, test_data), batch_size=BATCH_SIZE,
        shuffle=True, repeat=False, sort_within_batch=False, sort_key=lambda x: len(x.comments))

print('훈련 데이터의 미니 배치의 개수 : {}'.format(len(train_iter)))
print('테스트 데이터의 미니 배치의 개수 : {}'.format(len(test_iter)))
print('검증 데이터의 미니 배치의 개수 : {}'.format(len(val_iter)))

훈련 데이터의 미니 배치의 개수 : 25
테스트 데이터의 미니 배치의 개수 : 7
검증 데이터의 미니 배치의 개수 : 2


# CNN feature + BiLSTM

In [9]:
class CNNfeatured_BiLSTM(nn.Module):
    def __init__(self, pre_embedding,
                 char_emb_dim, char_vocab_size,
                 hidden_dim, model_embedding, num_lstm_layer, n_classes,
                 dropout=0.3):
        super(CNNfeatured_BiLSTM, self).__init__()
        
        #variable for cnn
        channel_input_word = 1
        channel_output = 32
        self.char_emb_dim = char_emb_dim
        self.char_vocab_size = char_vocab_size
        # variable for bilstm
        self.hidden_dim = hidden_dim
        self.num_lstm_layer = num_lstm_layer
        self.n_classes = n_classes
        
        # char_cnn layer 세팅
        self.char_emb = nn.Embedding(self.char_vocab_size,
                                     self.char_emb_dim, padding_idx=0)
        self.cnn_layer = nn.Sequential(
            #nn.Conv2d(channel_input_word, channel_output, kernel_size=(3, char_emb_dim), stride=1, padding=1),
            nn.Conv2d(channel_input_word, channel_output, kernel_size=(3, char_emb_dim), stride=1),
            nn.ReLU())
        self.maxpool1d = nn.MaxPool1d(78)
        
        # embedding dimension 200
        self.embedding = nn.Embedding.from_pretrained(pre_embedding, freeze=False)
        # BiLSTM layer 세팅
        self.bi_lstm = nn.LSTM(input_size=self.embedding.embedding_dim+self.char_emb_dim,
                               hidden_size=self.hidden_dim,
                               num_layers=self.num_lstm_layer,
                               dropout=dropout,
                               batch_first=True,
                               bidirectional=True)
        
        # last linear layer 세팅
        # bidirectional 이라서 hidden_dim * 2
        self.linear = nn.Linear(self.hidden_dim * 2, self.n_classes)
                
    def forward(self, sents, chars):
        # sents:  torch.Size([128, 80]) = [batch_size, word_num_per_sentence]
        print('sents: ', sents.shape)
        print('chars: ', chars.shape)
        # embedding
        # word_embedded:  torch.Size([128, 80, 200]) = [batch_size, word_num_per_sentence, word_embedding_size]
        word_embedded = self.embedding(sents)
        print('word_embedded: ', word_embedded.shape)
        
        for i in range(chars.size(1)):
            char_embedded = self.char_emb(sents[:, 1])
            print('char_embedded: ', char_embedded.shape)
        """
        char_embedded = self.char_emb(sents)
        
        char_embedded = char_embedded.unsqueeze(1)
        print('unsqueezed char_embedded: ', char_embedded.shape)
        char_embedded = self.cnn_layer(char_embedded)
        print('after cnn_layer char_embedded: ', char_embedded.shape)
        char_embedded = char_embedded.squeeze(3)
        print('squeeze cnn_layer char_embedded: ', char_embedded.shape)
        char_embedded = self.maxpool1d(char_embedded)
        print('after maxpooling1d char_embedded: ', char_embedded.shape)
        """
        
        embedded = torch.cat([word_embedded, char_embedded], dim=-1)
        print('bilstm input: ', embedded.shape)
        
        # bilstm
        lstm_out, (h_n, c_n) = self.bi_lstm(embedded)
        
        # forward와 backward의 마지막 time-step의 은닉 상태를 가지고 와서 concat
        # 이때 모델이 batch_first라는 점에 주의한다. (dimension 순서가 바뀜)
        hidden = torch.cat((h_n[-2,:,:], h_n[-1,:,:]), dim = 1)
        out=self.linear(hidden)
        
        return

# Training function 정의

In [10]:
def train(model, optimizer, train_iter):
    model.train()
    
    corrects, total_loss = 0, 0
    
    for b, batch in enumerate(train_iter):
        # comments 는 x, hate(label)은 y로 두고
        x, y = (batch.comments.to(DEVICE), batch.comments_char.to(DEVICE)), batch.hate.to(DEVICE)
        # gradient 0으로 세팅해두고
        optimizer.zero_grad()
        # model 돌리고
        prediction = model(x)
        # loss 구해서 backprop
        loss = criterion(prediction, y)
        total_loss = total_loss + loss.item()
        loss.backward()
        optimizer.step()
    
    size = len(train_iter.dataset)
    avg_loss = total_loss / size
    
    return avg_loss
        
def evaluate(model, val_iter):
    model.eval()
    
    corrects, total_loss = 0, 0
    
    for b, batch in enumerate(val_iter):
        x, y = batch.comments.to(DEVICE), batch.hate.to(DEVICE)
        prediction = model(x)
        loss = criterion(prediction, y)
        total_loss = total_loss + loss.item()
        corrects = corrects + (prediction.max(1)[1].view(y.size()).data == y.data).sum()
        
        y = y.data
        y = y.to("cpu")
        y = y.detach().numpy()
        p = prediction.max(1)[1].data
        p = p.to("cpu")
        p = p.detach().numpy()
    
    print('**** Look only last batch case ****')
    print(metrics.classification_report(y,p))
        
    size = len(val_iter.dataset)
    avg_loss = total_loss / size
    avg_accuracy = 100.0 * corrects / size
    
    return avg_loss, avg_accuracy

# CNN feature+BiLSTM 선언

In [11]:
max_word_len = 10
num_filters = 32
char_emb_dim = 30
char_vocab_size = 5000
final_char_dim = 50
cnn_feat_bilstm_model = CNNfeatured_BiLSTM(pre_embedding=comments.vocab.vectors,
                                           char_emb_dim=char_emb_dim,
                                           char_vocab_size=char_vocab_size,
                                           hidden_dim=HIDDEN_DIM,
                                           model_embedding=model_kor_word2vec,
                                           num_lstm_layer=NUM_LSTM_LAYER,
                                           n_classes=n_classes)

print(cnn_feat_bilstm_model)
cnn_feat_bilstm_model = cnn_feat_bilstm_model.to(DEVICE)
optimizer = torch.optim.Adam(cnn_feat_bilstm_model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

CNNfeatured_BiLSTM(
  (char_emb): Embedding(5000, 30, padding_idx=0)
  (cnn_layer): Sequential(
    (0): Conv2d(1, 32, kernel_size=(3, 30), stride=(1, 1))
    (1): ReLU()
  )
  (maxpool1d): MaxPool1d(kernel_size=78, stride=78, padding=0, dilation=1, ceil_mode=False)
  (embedding): Embedding(4033, 200)
  (bi_lstm): LSTM(230, 256, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)
  (linear): Linear(in_features=512, out_features=3, bias=True)
)


# CNN feature+BiLSTM training and Test

In [12]:
best_val_loss = None
num_epoch = 10

print('[STAGE] Train')
for i in range(1, num_epoch+1):
    train_loss = train(cnn_feat_bilstm_model, optimizer, train_iter)
    val_loss, val_accuracy = evaluate(cnn_feat_bilstm_model, val_iter)
    result = (
        f'[Epoch: {i}/{num_epoch}] train loss : {train_loss:.4f} '
        f'| val loss : {val_loss:.4f} | val accuracy : {val_accuracy:.4f}%'
    )
    print(result)
    
    # 검증 오차가 가장 적은 최적의 모델을 저장
    if not best_val_loss or val_loss < best_val_loss:
        if not os.path.isdir("snapshot"):
            os.makedirs("snapshot")
        torch.save(cnn_bilstm_model.state_dict(), './snapshot/hate_classification_cnn_feat_bilstm.pt')
        best_val_loss = val_loss
    

[STAGE] Train


TypeError: unhashable type: 'list'

In [None]:
cnn_bilstm_model.load_state_dict(torch.load('./snapshot/hate_classification_cnn+bilstm.pt'))
test_loss, test_acc = evaluate(cnn_bilstm_model, test_iter)
result = f'테스트 오차: {test_loss:.4f} | 테스트 정확도: {test_acc:.4f}%'
print(result)

----------------------------------------
----------------------------------------
----------------------------------------
----------------------------------------
----------------------------------------
----------------------------------------

In [None]:
class CharCNN(nn.Module):
    def __init__(self,
                 max_word_len=10,
                 num_filters=32,
                 char_vocab_size=1000,
                 char_emb_dim=30,
                 final_char_dim=50,
                 dropout=0.25,
                 n_hidden_linear=100):
        super(CharCNN, self).__init__()


    def forward(self, x):
        print('x : ', x.shape)
        #batch_size = x.size(0)
        #max_seq_len = x.size(1)
        #max_word_len = x.size(2)

        x = self.char_emb(x)  # (b, s, w, d)
        x = x.view(batch_size * max_seq_len, max_word_len, -1)  # (b*s, w, d)
        x = x.transpose(2, 1)  # (b*s, d, w): Conv1d takes in (batch, dim, seq_len), but raw embedded is (batch, seq_len, dim)

        conv_lst = [conv(x) for conv in self.convs]
        conv_concat = torch.cat(conv_lst, dim=-1)  # (b*s, num_filters, len(kernel_lst))
        conv_concat = conv_concat.view(conv_concat.size(0), -1)  # (b*s, num_filters * len(kernel_lst))

        output = self.linear(conv_concat)  # (b*s, final_char_dim)
        output = output.view(batch_size, max_seq_len, -1)  # (b, s, final_char_dim)
        return output
