In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('./Data/lstm.csv')

In [3]:
df.category.unique()

array(['food', 'sports'], dtype=object)

In [4]:
df['paragraph'][0]

'dishplace is located in sunnyvale downtown there is parking around the area but it can be difficult to find during peak business hours my sisters and i came to this place for dinner on a weekday they were really busy so i highly recommended making reservations unless you have the patience to wait'

### DF에 단어 개수 파악하기

In [5]:
# 데이터에 사용된 중복없는 전체 단어 갯수를 파악
result = set()

# 해당 단어 개수 파악하는 방법 : set.update 알아봐야 될 듯
df['paragraph'].str.lower().str.split().apply(result.update)

vacab_size = len(result)
vacab_size

536

### 단어를 숫자로 인코딩

In [6]:

paragraphs = df['paragraph'].to_list()
paragraphs[0:5]

['dishplace is located in sunnyvale downtown there is parking around the area but it can be difficult to find during peak business hours my sisters and i came to this place for dinner on a weekday they were really busy so i highly recommended making reservations unless you have the patience to wait',
 'service can be slower during busy hours but our waiter was courteous and help gave some great entree recommendations',
 'portions are huge both french toast and their various omelettes are really good their french toast is probably 1.5x more than other brunch places great place to visit if you are hungry and dont want to wait 1 hour for a table',
 'we started with apps going the chicken and waffle slides and chicken nachos the sliders were amazing and the nachos were good too maybe by themselves the nachos would have scored better but after those sliders they were up against some tough competition',
 'the biscuits and gravy was too salty two people in my group had the gravy and all thoug

In [9]:
from tensorflow import keras

In [81]:
keras.utils.set_random_seed(1)

In [10]:
# one-hot encoding : 어휘 사전 만들어 주기
encoded_paragraphs = [keras.preprocessing.text.one_hot(paragraph ,vacab_size) for paragraph in paragraphs]
encoded_paragraphs[1]

[71,
 308,
 474,
 89,
 72,
 368,
 24,
 231,
 408,
 387,
 321,
 241,
 263,
 142,
 306,
 274,
 307,
 397,
 448]

### Zero Padding 하기 위해 가장 긴 문자 값 찾기

In [11]:
# 데이터에서 가장 긴 문장의 단어 갯수를 확인
max_length = 0 

for row in df['paragraph']:
    temp = len(row.split())
    if temp > max_length:
        max_length = temp

max_length

91

In [12]:
import numpy as np

In [13]:
# 문장마다 단어 갯수가 다르므로, 제로 패딩을 넣어줘서 문장의 길이를 동일하게 만든다.
padded_paragraphs_encoding = keras.preprocessing.sequence.pad_sequences(encoded_paragraphs,
                                                                        maxlen=max_length,
                                                                        padding='post' # 0값을 뒤에 채우겠다.
                                                                        )

In [14]:
padded_paragraphs_encoding

array([[  8, 515, 467, ...,   0,   0,   0],
       [ 71, 308, 474, ...,   0,   0,   0],
       [523, 242, 414, ...,   0,   0,   0],
       ...,
       [408, 149, 258, ...,   0,   0,   0],
       [231, 313,  63, ...,   0,   0,   0],
       [146, 171, 313, ...,   0,   0,   0]], dtype=int32)

### 타겟 데이터 숫자로 바꿔주기

In [15]:
categories = df['category'].to_list()

In [16]:
def category_encode(category):
    if category == 'food':
        return [1,0]
    else:
        return [0,1]

In [17]:
encoded_category = []
for category in categories:
    encoded_category.append(category_encode(category))

encoded_category[19]

[0, 1]

In [18]:
# feature 확인
print(encoded_paragraphs[19])

[146, 171, 313, 150, 522, 300, 407, 295, 101, 160, 150, 15, 258, 46, 372, 313, 463, 352, 118, 257, 171, 474, 375, 249, 469, 84, 313, 439, 249, 446, 328, 518, 318, 337, 307, 295, 413, 203, 533, 367, 523, 40, 157, 51, 263, 44, 484, 190, 84, 415, 313, 174, 443, 231, 87, 321, 407, 209, 287, 434, 313, 433, 24, 328, 147, 40, 439, 295, 258, 40, 207, 251, 316]


### 주제를 분류하는 모델 구현하기
- rmsprop 말고 lstm을 써서 반복하는 걸 더 많이 하는 추세이다.

In [19]:
model = keras.Sequential()

# 문맥 생성단계
model.add(keras.layers.Embedding(vacab_size,
                                 5,
                                 input_length=max_length
                                 ))
model.add(keras.layers.LSTM(64))

# 분류 단계
model.add(keras.layers.Dense(32,activation='relu'))
model.add(keras.layers.Dense(2,activation='softmax'))

In [20]:
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics='accuracy')

In [21]:
train_X = np.array(padded_paragraphs_encoding)
train_y = np.array(encoded_category)

In [22]:
history = model.fit(train_X,train_y,
                    epochs=100,
                    batch_size=10 # 줄일 수록 디테일 해진다.
                    )

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78