# Method

In [2]:
# dataFrame을 넣고 실행하면 X_data, y_data, vocab_size, max_len를 돌려주는 get_X_y_vacabsize_maxlen method
# @name   : get_X_y_vacabsize_maxlen
# @pram   : DataFrame
# @return : np.array, list, int, int
def get_X_y_vacabsize_maxlen(df):
    X_data = df['comment']
    y_data = df['bad']
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X_data)
    sequences = tokenizer.texts_to_sequences(X_data)
    df['comment'] = sequences
    word_to_index = tokenizer.word_index
    vocab_size = len(word_to_index)+1
    max_len = max(len(l) for l in sequences)
    data = pad_sequences(sequences, maxlen=max_len)
    X_data = (data)
    y_data = df['bad']
    return X_data, y_data, vocab_size, max_len

# vocab_size, max_len를 넣고 실행하면 CNN모델을 만들어주는 CNNmodel method
# @name   : CNNmodel
# @pram   : int, int
# @return : Sequential
def CNNmodel(vocab_size, max_len):
    model = Sequential()
    model.add(Embedding(vocab_size, 32, input_length=max_len))
    model.add(Conv1D(filters=32, kernel_size=3, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))

    model.add(Flatten())
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# vocab_size, max_len를 넣고 실행하면 LSTM모델을 만들어주는 LSTMmodel method
# @name   : LSTMmodel
# @pram   : int, int
# @return : Sequential
def LSTMmodel(vocab_size,max_len):
    model = Sequential()
    model.add(Embedding(vocab_size, 32))
    model.add(LSTM(max_len, activation='tanh'))
    model.add(Dense(units=1))
    model.add(Activation('relu'))
    model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy'])
    return model

# vocab_size를 넣으면 RNN모델을 만들어주는 RNNmodel method
# @name   : RNNmodel
# @pram   : int
# @return : Sequential
def RNNmodel(vocab_size):
    model = Sequential()
    model.add(Embedding(vocab_size, 32)) # 임베딩 벡터의 차원은 32
    model.add(SimpleRNN(32)) # RNN 셀의 hidden_size는 32
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
    return model

# import

In [3]:
from keras.preprocessing import sequence

from keras.utils import np_utils
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import pyprind
import os

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, SimpleRNN, Embedding, Dense, Activation, Dropout, Flatten

Using TensorFlow backend.


## 처리 방식에 따른 네 가지 데이터 호출

In [5]:
sulliComments_nouns = pd.read_csv('sulliComments_nouns.csv',encoding='cp949')
sulliComments_nounsJaso = pd.read_csv('sulliComments_nounsJaso.csv',encoding='cp949')
sulliComments_morps = pd.read_csv('sulliComments_morps.csv',encoding='cp949')
sulliComments_morpsJaso = pd.read_csv('sulliComments_morpsJaso.csv',encoding='cp949')

## 처리 방식 별 X, y, vocab_size, max_len 지정

In [6]:
nouns_X_data,nouns_y_data, nouns_vocab_size, nouns_max_len = get_X_y_vacabsize_maxlen(sulliComments_nouns)
nounsJaso_X_data,nounsJaso_y_data, nounsJaso_vocab_size, nounsJaso_max_len = get_X_y_vacabsize_maxlen(sulliComments_nounsJaso)
morps_X_data,morps_y_data, morps_vocab_size, morps_max_len = get_X_y_vacabsize_maxlen(sulliComments_morps)
morpsJaso_X_data,morpsJaso_y_data, morpsJaso_vocab_size, morpsJaso_max_len = get_X_y_vacabsize_maxlen(sulliComments_morpsJaso)

## 처리 방식 별 train, valid, test set 생성(80:20rule)

In [7]:
nouns_X_train, nouns_X_test, nouns_y_train, nouns_y_test = train_test_split(nouns_X_data, nouns_y_data, test_size=0.2, random_state=0)
nouns_X_train, nouns_X_valid, nouns_y_train, nouns_y_valid = train_test_split(nouns_X_train, nouns_y_train, test_size=0.2, random_state=0)

nounsJaso_X_train, nounsJaso_X_test, nounsJaso_y_train, nounsJaso_y_test = train_test_split(nounsJaso_X_data, nounsJaso_y_data, test_size=0.2, random_state=0)
nounsJaso_X_train, nounsJaso_X_valid, nounsJaso_y_train, nounsJaso_y_valid = train_test_split(nounsJaso_X_train, nounsJaso_y_train, test_size=0.2, random_state=0)

morps_X_train, morps_X_test, morps_y_train, morps_y_test = train_test_split(morps_X_data, morps_y_data, test_size=0.2, random_state=0)
morps_X_train, morps_X_valid, morps_y_train, morps_y_valid = train_test_split(morps_X_train, morps_y_train, test_size=0.2, random_state=0)

morpsJaso_X_train, morpsJaso_X_test, morpsJaso_y_train, morpsJaso_y_test = train_test_split(morpsJaso_X_data, morpsJaso_y_data, test_size=0.2, random_state=0)
morpsJaso_X_train, morpsJaso_X_valid, morpsJaso_y_train, morpsJaso_y_valid = train_test_split(morpsJaso_X_train, morpsJaso_y_train, test_size=0.2, random_state=0)

## 처리 방식 별 12개의 모델 생성

In [8]:
nouns_RNN_model = RNNmodel(nouns_vocab_size)
nouns_LSTM_model = LSTMmodel(nouns_vocab_size, nouns_max_len)
nouns_CNN_model = CNNmodel(nouns_vocab_size, nouns_max_len)

nounsJaso_RNN_model = RNNmodel(nounsJaso_vocab_size)
nounsJaso_LSTM_model = LSTMmodel(nounsJaso_vocab_size, nounsJaso_max_len)
nounsJaso_CNN_model = CNNmodel(nounsJaso_vocab_size, nounsJaso_max_len)

morps_RNN_model = RNNmodel(morps_vocab_size)
morps_LSTM_model = LSTMmodel(morps_vocab_size, morps_max_len)
morps_CNN_model = CNNmodel(morps_vocab_size, morps_max_len)

morpsJaso_RNN_model = RNNmodel(morpsJaso_vocab_size)
morpsJaso_LSTM_model = LSTMmodel(morpsJaso_vocab_size, morpsJaso_max_len)
morpsJaso_CNN_model = CNNmodel(morpsJaso_vocab_size, morpsJaso_max_len)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


# Training

In [9]:
nouns_RNN_history = nouns_RNN_model.fit(nouns_X_train, nouns_y_train, epochs=15, batch_size=60, validation_data=(nouns_X_valid,nouns_y_valid))
print("nouns_RNN 테스트 정확도: %.4f" % (nouns_RNN_model.evaluate(nouns_X_test, nouns_y_test)[1]))

Train on 768 samples, validate on 192 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
nouns_RNN 테스트 정확도: 0.8542


In [10]:
nouns_LSTM_history = nouns_LSTM_model.fit(nouns_X_train, nouns_y_train, epochs=7, batch_size=60, validation_data=(nouns_X_valid,nouns_y_valid))
print("nouns_LSTM 테스트 정확도: %.4f" % (nouns_LSTM_model.evaluate(nouns_X_test, nouns_y_test)[1]))

Train on 768 samples, validate on 192 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
nouns_LSTM 테스트 정확도: 0.8708


In [11]:
nouns_CNN_history = nouns_CNN_model.fit(nouns_X_train, nouns_y_train, epochs=17, batch_size=60, validation_data=(nouns_X_valid,nouns_y_valid))
print("nouns_CNN 테스트 정확도: %.4f" % (nouns_CNN_model.evaluate(nouns_X_test, nouns_y_test)[1]))


Train on 768 samples, validate on 192 samples
Epoch 1/17
Epoch 2/17
Epoch 3/17
Epoch 4/17
Epoch 5/17
Epoch 6/17
Epoch 7/17
Epoch 8/17
Epoch 9/17
Epoch 10/17
Epoch 11/17
Epoch 12/17
Epoch 13/17
Epoch 14/17
Epoch 15/17
Epoch 16/17
Epoch 17/17
nouns_CNN 테스트 정확도: 0.9042


In [12]:
nounsJaso_RNN_history = nounsJaso_RNN_model.fit(nounsJaso_X_train, nounsJaso_y_train, epochs=15, batch_size=60, validation_data=(nounsJaso_X_valid,nounsJaso_y_valid))
print("nounsJaso_RNN 테스트 정확도: %.4f" % (nounsJaso_RNN_model.evaluate(nounsJaso_X_test, nounsJaso_y_test)[1]))

Train on 768 samples, validate on 192 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
nounsJaso_RNN 테스트 정확도: 0.8917


In [13]:
nounsJaso_LSTM_history = nounsJaso_LSTM_model.fit(nounsJaso_X_train, nounsJaso_y_train, epochs=7, batch_size=60, validation_data=(nounsJaso_X_valid,nounsJaso_y_valid))
print("nounsJaso_LSTM 테스트 정확도: %.4f" % (nounsJaso_LSTM_model.evaluate(nounsJaso_X_test, nounsJaso_y_test)[1]))

Train on 768 samples, validate on 192 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
nounsJaso_LSTM 테스트 정확도: 0.8917


In [24]:
nounsJaso_CNN_history = nounsJaso_CNN_model.fit(nounsJaso_X_train, nounsJaso_y_train, epochs=17, batch_size=60, validation_data=(nounsJaso_X_valid,nounsJaso_y_valid))
print("nounsJaso_CNN 테스트 정확도: %.4f" % (nounsJaso_CNN_model.evaluate(nounsJaso_X_test, nounsJaso_y_test)[1]))

Train on 768 samples, validate on 192 samples
Epoch 1/17
Epoch 2/17
Epoch 3/17
Epoch 4/17
Epoch 5/17
Epoch 6/17
Epoch 7/17
Epoch 8/17
Epoch 9/17
Epoch 10/17
Epoch 11/17
Epoch 12/17
Epoch 13/17
Epoch 14/17
Epoch 15/17
Epoch 16/17
Epoch 17/17
nounsJaso_CNN 테스트 정확도: 0.9125


In [15]:
morps_RNN_history = morps_RNN_model.fit(morps_X_train, morps_y_train, epochs=15, batch_size=60, validation_data=(morps_X_valid,morps_y_valid))
print("morps_RNN 테스트 정확도: %.4f" % (morps_RNN_model.evaluate(morps_X_test, morps_y_test)[1]))


Train on 768 samples, validate on 192 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
morps_RNN 테스트 정확도: 0.8458


In [16]:
morps_LSTM_history = morps_LSTM_model.fit(morps_X_train, morps_y_train, epochs=7, batch_size=60, validation_data=(morps_X_valid,morps_y_valid))
print("morps_LSTM 테스트 정확도: %.4f" % (morps_LSTM_model.evaluate(morps_X_test, morps_y_test)[1]))


Train on 768 samples, validate on 192 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
morps_LSTM 테스트 정확도: 0.8667


In [17]:
morps_CNN_history = morps_CNN_model.fit(morps_X_train, morps_y_train, epochs=17, batch_size=60, validation_data=(morps_X_valid,morps_y_valid))
print("morps_CNN 테스트 정확도: %.4f" % (morps_CNN_model.evaluate(morps_X_test, morps_y_test)[1]))


Train on 768 samples, validate on 192 samples
Epoch 1/17
Epoch 2/17
Epoch 3/17
Epoch 4/17
Epoch 5/17
Epoch 6/17
Epoch 7/17
Epoch 8/17
Epoch 9/17
Epoch 10/17
Epoch 11/17
Epoch 12/17
Epoch 13/17
Epoch 14/17
Epoch 15/17
Epoch 16/17
Epoch 17/17
morps_CNN 테스트 정확도: 0.9083


In [26]:
morpsJaso_RNN_history = morpsJaso_RNN_model.fit(morpsJaso_X_train, morpsJaso_y_train, epochs=15, batch_size=60, validation_data=(morpsJaso_X_valid,morpsJaso_y_valid))
print("morpsJaso_RNN 테스트 정확도: %.4f" % (morpsJaso_RNN_model.evaluate(morpsJaso_X_test, morpsJaso_y_test)[1]))


Train on 768 samples, validate on 192 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
morpsJaso_RNN 테스트 정확도: 0.8708


In [36]:
morpsJaso_LSTM_model = LSTMmodel(morpsJaso_vocab_size, morpsJaso_max_len)


morpsJaso_LSTM_history = morpsJaso_LSTM_model.fit(morpsJaso_X_train, morpsJaso_y_train, epochs=7, batch_size=60, validation_data=(morpsJaso_X_valid,morpsJaso_y_valid))
print("morpsJaso_LSTM 테스트 정확도: %.4f" % (morpsJaso_LSTM_model.evaluate(morpsJaso_X_test, morpsJaso_y_test)[1]))


Train on 768 samples, validate on 192 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
morpsJaso_LSTM 테스트 정확도: 0.8833


In [20]:
morpsJaso_CNN_history = morpsJaso_CNN_model.fit(morpsJaso_X_train, morpsJaso_y_train, epochs=17, batch_size=60, validation_data=(morpsJaso_X_valid,morpsJaso_y_valid))
print("morpsJaso_CNN 테스트 정확도: %.4f" % (morpsJaso_CNN_model.evaluate(morpsJaso_X_test, morpsJaso_y_test)[1]))

Train on 768 samples, validate on 192 samples
Epoch 1/17
Epoch 2/17
Epoch 3/17
Epoch 4/17
Epoch 5/17
Epoch 6/17
Epoch 7/17
Epoch 8/17
Epoch 9/17
Epoch 10/17
Epoch 11/17
Epoch 12/17
Epoch 13/17
Epoch 14/17
Epoch 15/17
Epoch 16/17
Epoch 17/17
morpsJaso_CNN 테스트 정확도: 0.9000


# Test set Accuracy

In [37]:
#명사 테스트 정확도
print("nouns_RNN 테스트 정확도: %.4f" % (nouns_RNN_model.evaluate(nouns_X_test, nouns_y_test)[1]))
print("nouns_LSTM 테스트 정확도: %.4f" % (nouns_LSTM_model.evaluate(nouns_X_test, nouns_y_test)[1]))
print("nouns_CNN 테스트 정확도: %.4f" % (nouns_CNN_model.evaluate(nouns_X_test, nouns_y_test)[1]))

#명사 자소 분리 테스트 정확도
print("nounsJaso_RNN 테스트 정확도: %.4f" % (nounsJaso_RNN_model.evaluate(nounsJaso_X_test, nounsJaso_y_test)[1]))
print("nounsJaso_LSTM 테스트 정확도: %.4f" % (nounsJaso_LSTM_model.evaluate(nounsJaso_X_test, nounsJaso_y_test)[1]))
print("nounsJaso_CNN 테스트 정확도: %.4f" % (nounsJaso_CNN_model.evaluate(nounsJaso_X_test, nounsJaso_y_test)[1]))

#모든 형태소 정확도
print("morps_RNN 테스트 정확도: %.4f" % (morps_RNN_model.evaluate(morps_X_test, morps_y_test)[1]))
print("morps_LSTM 테스트 정확도: %.4f" % (morps_LSTM_model.evaluate(morps_X_test, morps_y_test)[1]))
print("morps_CNN 테스트 정확도: %.4f" % (morps_CNN_model.evaluate(morps_X_test, morps_y_test)[1]))

#모든 형태소 자소 분리 정확도
print("morpsJaso_RNN 테스트 정확도: %.4f" % (morpsJaso_RNN_model.evaluate(morpsJaso_X_test, morpsJaso_y_test)[1]))
print("morpsJaso_LSTM 테스트 정확도: %.4f" % (morpsJaso_LSTM_model.evaluate(morpsJaso_X_test, morpsJaso_y_test)[1]))
print("morpsJaso_CNN 테스트 정확도: %.4f" % (morpsJaso_CNN_model.evaluate(morpsJaso_X_test, morpsJaso_y_test)[1]))

nouns_RNN 테스트 정확도: 0.8542
nouns_LSTM 테스트 정확도: 0.8708
nouns_CNN 테스트 정확도: 0.9042
nounsJaso_RNN 테스트 정확도: 0.8917
nounsJaso_LSTM 테스트 정확도: 0.8917
nounsJaso_CNN 테스트 정확도: 0.9125
morps_RNN 테스트 정확도: 0.8458
morps_LSTM 테스트 정확도: 0.8667
morps_CNN 테스트 정확도: 0.9083
morpsJaso_RNN 테스트 정확도: 0.8708
morpsJaso_LSTM 테스트 정확도: 0.8833
morpsJaso_CNN 테스트 정확도: 0.9000
