# Naver 영화평 감성분석
- LSTM 활용
- 출처: WikiDocs
- 데이터 다운로드 링크: https://github.com/e9t/nsmc/

In [1]:
# Konlpy 설치
!pip install Konlpy

Collecting Konlpy
[?25l  Downloading https://files.pythonhosted.org/packages/85/0e/f385566fec837c0b83f216b2da65db9997b35dd675e107752005b7d392b1/konlpy-0.5.2-py2.py3-none-any.whl (19.4MB)
[K     |████████████████████████████████| 19.4MB 1.2MB/s 
[?25hCollecting tweepy>=3.7.0
  Downloading https://files.pythonhosted.org/packages/67/c3/6bed87f3b1e5ed2f34bd58bf7978e308c86e255193916be76e5a5ce5dfca/tweepy-3.10.0-py2.py3-none-any.whl
Collecting beautifulsoup4==4.6.0
[?25l  Downloading https://files.pythonhosted.org/packages/9e/d4/10f46e5cfac773e22707237bfcd51bbffeaf0a576b0a847ec7ab15bd7ace/beautifulsoup4-4.6.0-py3-none-any.whl (86kB)
[K     |████████████████████████████████| 92kB 9.7MB/s 
Collecting JPype1>=0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/de/af/93f92b38ec1ff3091cd38982ed19cea2800fefb609b5801c41fc43c0781e/JPype1-1.2.1-cp36-cp36m-manylinux2010_x86_64.whl (457kB)
[K     |████████████████████████████████| 460kB 51.3MB/s 
[?25hCollecting colorama
  Download

In [2]:
import konlpy
konlpy.__version__

'0.5.2'

In [3]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [4]:
seed = 2021
np.random.seed(seed)
tf.random.set_seed(seed)

In [5]:
from google.colab import files

uploaded = files.upload()
filename = list(uploaded.keys())[0]

Saving ratings_train.txt to ratings_train.txt


In [6]:
uploaded = files.upload()
testfile = list(uploaded.keys())[0]

Saving ratings_test.txt to ratings_test.txt


### 데이터 전처리

In [7]:
import pandas as pd
train_data = pd.read_table(filename)
test_data = pd.read_table(testfile)

In [8]:
train_data.shape

(150000, 3)

In [9]:
train_data.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


In [10]:
# 중복 여부 확인
train_data['document'].nunique()

146182

In [11]:
train_data.drop_duplicates(subset=['document'], inplace=True)
train_data.shape

(146183, 3)

In [12]:
# Null 값 확인
train_data.isnull().sum()

id          0
document    1
label       0
dtype: int64

In [13]:
# Null 값 제거
train_data = train_data.dropna(how='any')
train_data.shape

(146182, 3)

In [14]:
# 긍정, 부정 레이블 값의 개수
train_data.label.value_counts()

0    73342
1    72840
Name: label, dtype: int64

- 테스트 데이터셋에서도 똑같이 수행

In [15]:
test_data.drop_duplicates(subset=['document'], inplace=True)
test_data.shape

(49158, 3)

In [16]:
test_data = test_data.dropna(how='any')
test_data.shape

(49157, 3)

### 한글 텍스트 전처리

In [17]:
train_data['document'] = train_data['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "")
train_data.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙 진짜 짜증나네요 목소리,0
1,3819312,흠포스터보고 초딩영화줄오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 솔직히 재미는 없다평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화스파이더맨에서 늙어보이기만 했던 커스틴 던...,1


In [18]:
train_data['document'].replace('', np.nan, inplace=True)
train_data.isnull().sum()

id            0
document    391
label         0
dtype: int64

In [19]:
train_data = train_data.dropna(how='any')
train_data.shape

(145791, 3)

In [20]:
test_data['document'] = test_data['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "")
test_data['document'].replace('', np.nan, inplace=True)
test_data.isnull().sum()

id            0
document    162
label         0
dtype: int64

In [21]:
test_data = test_data.dropna(how='any')
test_data.shape

(48995, 3)

### 한글 형태소 분석

In [22]:
# 토큰화, 불용어 제거
import tqdm.notebook as tn
from konlpy.tag import Okt
stopwords = ['의', '가', '이', '은', '들', '는', '좀', '잘', '걍', '과', '도', '를', '으로', '자', '에', '와', '한', '하다']
okt = Okt()

X_train = []
for sentence in tn.tqdm(train_data['document']):
  temp_X = []
  temp_X = okt.morphs(sentence, stem=True) # 토큰화
  temp_X = [word for word in temp_X if not word in stopwords] # 불용어 제거
  X_train.append(temp_X)

HBox(children=(FloatProgress(value=0.0, max=145791.0), HTML(value='')))




In [23]:
X_test = []
for sentence in tn.tqdm(test_data['document']):
  temp_X = []
  temp_X = okt.morphs(sentence, stem=True)
  temp_X = [word for word in temp_X if not word in stopwords]
  X_test.append(temp_X)

HBox(children=(FloatProgress(value=0.0, max=48995.0), HTML(value='')))




In [24]:
X_train[:3]

[['아', '더빙', '진짜', '짜증나다', '목소리'],
 ['흠', '포스터', '보고', '초딩', '영화', '줄', '오버', '연기', '조차', '가볍다', '않다'],
 ['너', '무재', '밓었', '다그', '래서', '보다', '추천', '다']]

### 케라스 인코딩

In [25]:
# 정수 인코딩
max_words = 35000
tokenizer = Tokenizer(num_words=max_words) # 최초 35000개 단어만 보존
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [26]:
X_train[:3]

[[50, 454, 16, 260, 659],
 [933, 457, 41, 602, 1, 214, 1449, 24, 961, 675, 19],
 [386, 2444, 25015, 2315, 5671, 2, 222, 9]]

In [27]:
# 전체 데이터의 길이 분포
print('리뷰의 최대 길이: ', max(len(s) for s in X_train))
print('리뷰의 평균 길이: ', sum(map(len, X_train)) / len(X_train))

리뷰의 최대 길이:  69
리뷰의 평균 길이:  10.911133060339802


In [28]:
# import matplotlib.pyplot as plt


In [29]:
max_len = 30
X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)

In [30]:
y_train = train_data['label'].values
y_test = test_data['label'].values

### LSTM 모델 정의/설정/학습/평가

In [31]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding

In [32]:
model = Sequential([
                    Embedding(max_words, 100),
                    LSTM(128),
                    Dense(1, activation='sigmoid')
])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 100)         3500000   
_________________________________________________________________
lstm (LSTM)                  (None, 128)               117248    
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 3,617,377
Trainable params: 3,617,377
Non-trainable params: 0
_________________________________________________________________


In [33]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [34]:
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
modelcheck = ModelCheckpoint(filepath='naver-lstm-best-model.h5', monitor='val_loss', verbose=1, save_best_only=True)
earlystop = EarlyStopping(monitor='val_loss', patience=10)

In [35]:
history = model.fit(X_train, y_train, epochs=5, batch_size=60, validation_split=0.2, verbose=1, callbacks=[modelcheck, earlystop])

Epoch 1/5

Epoch 00001: val_loss improved from inf to 0.34942, saving model to naver-lstm-best-model.h5
Epoch 2/5

Epoch 00002: val_loss improved from 0.34942 to 0.34115, saving model to naver-lstm-best-model.h5
Epoch 3/5

Epoch 00003: val_loss did not improve from 0.34115
Epoch 4/5

Epoch 00004: val_loss did not improve from 0.34115
Epoch 5/5

Epoch 00005: val_loss did not improve from 0.34115


In [36]:
from tensorflow.keras.models import load_model
best_model = load_model('naver-lstm-best-model.h5')
acc = best_model.evaluate(X_test, y_test, verbose=2)[1]
print(f'Accuracy: {acc:.4f}')

1532/1532 - 4s - loss: 0.3487 - accuracy: 0.8486
Accuracy: 0.8486


### Simple RNN

In [38]:
from tensorflow.keras.layers import SimpleRNN
model2 = Sequential([
                     Embedding(max_words, 100),
                     SimpleRNN(128),
                     Dense(1, activation='sigmoid')
])
model2.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 100)         3500000   
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 128)               29312     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 3,529,441
Trainable params: 3,529,441
Non-trainable params: 0
_________________________________________________________________


In [39]:
model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [40]:
modelcheck2 = ModelCheckpoint(filepath='naver-rnn-best-model.h5', monitor='val_loss', verbose=1, save_best_only=True)

In [41]:
history2 = model2.fit(X_train, y_train, epochs=4, batch_size=60, validation_split=0.2, verbose=1, callbacks=[modelcheck2, earlystop])

Epoch 1/4

Epoch 00001: val_loss improved from inf to 0.37441, saving model to naver-rnn-best-model.h5
Epoch 2/4

Epoch 00002: val_loss did not improve from 0.37441
Epoch 3/4

Epoch 00003: val_loss did not improve from 0.37441
Epoch 4/4

Epoch 00004: val_loss did not improve from 0.37441


In [42]:
best_model2 = load_model('naver-rnn-best-model.h5')
acc2 = best_model2.evaluate(X_test, y_test, verbose=2)[1]
print(f'Accuracy: {acc2:.4f}')

1532/1532 - 5s - loss: 0.3777 - accuracy: 0.8342
Accuracy: 0.8342


### CNN + LSTM

In [44]:
from tensorflow.keras.layers import Conv1D, Dropout, MaxPooling1D
model3 = Sequential([
                     Embedding(max_words, 100),
                     Dropout(0.5),
                     Conv1D(64, 5, padding='valid', activation='relu'),
                     MaxPooling1D(pool_size=4),
                     LSTM(60),
                     Dense(1, activation='sigmoid')
])
model3.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 100)         3500000   
_________________________________________________________________
dropout_1 (Dropout)          (None, None, 100)         0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, None, 64)          32064     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, None, 64)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 60)                30000     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 61        
Total params: 3,562,125
Trainable params: 3,562,125
Non-trainable params: 0
____________________________________________

In [46]:
model3.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
checkpointer = ModelCheckpoint(filepath='naver-cnn-best-model.h5', monitor='val_loss', verbose=1, save_best_only=True)

history3 = model3.fit(X_train, y_train,  epochs=5, batch_size=60, validation_split=0.2, verbose=1,  callbacks=[checkpointer])

Epoch 1/5

Epoch 00001: val_loss improved from inf to 0.44729, saving model to naver-cnn-best-model.h5
Epoch 2/5

Epoch 00002: val_loss improved from 0.44729 to 0.44129, saving model to naver-cnn-best-model.h5
Epoch 3/5

Epoch 00003: val_loss did not improve from 0.44129
Epoch 4/5

Epoch 00004: val_loss did not improve from 0.44129
Epoch 5/5

Epoch 00005: val_loss did not improve from 0.44129


<tensorflow.python.keras.callbacks.History at 0x7f03905cb9b0>

In [48]:
best_model3 = load_model('naver-cnn-best-model.h5')
acc3 = best_model3.evaluate(X_test, y_test, verbose=2)[1]
print(f'Accuracy: {acc2:.4f}')

1532/1532 - 3s - loss: 0.4509 - accuracy: 0.7766
Accuracy: 0.8342
