In [1]:
import pandas as pd

df = pd.read_csv("../Data/lstm.csv")
df.head()

Unnamed: 0,paragraph,category
0,dishplace is located in sunnyvale downtown the...,food
1,service can be slower during busy hours but ou...,food
2,portions are huge both french toast and their ...,food
3,we started with apps going the chicken and waf...,food
4,the biscuits and gravy was too salty two peopl...,food


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   paragraph  20 non-null     object
 1   category   20 non-null     object
dtypes: object(2)
memory usage: 448.0+ bytes


In [3]:
df.category.unique()

array(['food', 'sports'], dtype=object)

> food와 sports로만 구성되어 있음

In [4]:
# paragraph에 사용된 중복없는 전체 단어 갯수를 파악
results = set()
df['paragraph'].str.lower().str.split().apply(results.update)
vocab_size = len(results)
vocab_size

536

In [9]:
# 데이터에서 가장 긴 문장의 단어 갯수를 확인
max_length = 0
for row in df['paragraph']:
    if len(row.split(" ")) > max_length:
        max_length = len(row.split(" "))

print(max_length)

91


#### 단어를 숫자로 인코딩

In [5]:
paragraphs = df['paragraph'].to_list()
paragraphs

['dishplace is located in sunnyvale downtown there is parking around the area but it can be difficult to find during peak business hours my sisters and i came to this place for dinner on a weekday they were really busy so i highly recommended making reservations unless you have the patience to wait',
 'service can be slower during busy hours but our waiter was courteous and help gave some great entree recommendations',
 'portions are huge both french toast and their various omelettes are really good their french toast is probably 1.5x more than other brunch places great place to visit if you are hungry and dont want to wait 1 hour for a table',
 'we started with apps going the chicken and waffle slides and chicken nachos the sliders were amazing and the nachos were good too maybe by themselves the nachos would have scored better but after those sliders they were up against some tough competition',
 'the biscuits and gravy was too salty two people in my group had the gravy and all thoug

In [6]:
from tensorflow import keras
keras.utils.set_random_seed(1)

In [8]:
# one hot encoding
encoded_paragraphs = [keras.preprocessing.text.one_hot(paragraph, vocab_size) \
                    for paragraph in paragraphs]
print(encoded_paragraphs)

[[421, 32, 309, 45, 9, 46, 375, 32, 308, 20, 128, 306, 226, 444, 111, 513, 411, 453, 49, 408, 335, 198, 189, 115, 400, 300, 113, 237, 453, 516, 283, 160, 218, 82, 441, 430, 224, 271, 323, 105, 138, 113, 395, 341, 91, 530, 76, 456, 388, 128, 137, 453, 392], [386, 111, 513, 490, 408, 105, 189, 226, 421, 381, 144, 179, 300, 170, 7, 425, 346, 212, 255], [466, 280, 130, 39, 463, 360, 300, 382, 234, 125, 280, 323, 83, 382, 463, 360, 32, 182, 421, 100, 467, 103, 496, 489, 80, 346, 283, 453, 388, 285, 456, 280, 438, 300, 280, 208, 453, 392, 421, 130, 160, 441, 432], [397, 74, 120, 397, 444, 128, 344, 300, 4, 389, 300, 344, 125, 128, 424, 271, 444, 300, 128, 125, 271, 83, 7, 293, 100, 385, 128, 125, 375, 388, 222, 414, 226, 152, 228, 424, 224, 271, 469, 346, 425, 434, 429], [128, 279, 300, 35, 144, 7, 306, 412, 219, 45, 115, 396, 444, 128, 35, 300, 137, 302, 444, 144, 7, 306, 115, 381, 207, 441, 467, 260, 506, 421, 300, 444, 144, 154, 82, 412, 199, 252, 22, 484, 482, 453, 169, 95, 82, 74, 252, 

In [10]:
# 문장마다 단어 갯수가 다르므로, 시퀀스패딩을 넣어서 문자의 길이를 동일하게 맞춘다.
padded_paragraphs_encodging = keras.preprocessing.sequence.pad_sequences(\
                                    encoded_paragraphs,
                                    maxlen=max_length,
                                    padding = 'post'

)

padded_paragraphs_encodging

array([[421,  32, 309, ...,   0,   0,   0],
       [386, 111, 513, ...,   0,   0,   0],
       [466, 280, 130, ...,   0,   0,   0],
       ...,
       [422,  52, 160, ...,   0,   0,   0],
       [226, 128, 175, ...,   0,   0,   0],
       [159, 237, 128, ...,   0,   0,   0]], dtype=int32)

#### 분류 항목(food, sports)를 수치로 변경하기

In [14]:
categories = df['category'].to_list()

In [15]:
def category_encode(category):
    if category == "food":
        return [1,0]
    else:
        return [0,1]

In [16]:
encoded_category = [category_encode(category) for category in categories]

In [17]:
encoded_category[:5]

[[1, 0], [1, 0], [1, 0], [1, 0], [1, 0]]

#### Feature 확인

In [18]:
print(encoded_paragraphs[0])
print(encoded_paragraphs[19])

[421, 32, 309, 45, 9, 46, 375, 32, 308, 20, 128, 306, 226, 444, 111, 513, 411, 453, 49, 408, 335, 198, 189, 115, 400, 300, 113, 237, 453, 516, 283, 160, 218, 82, 441, 430, 224, 271, 323, 105, 138, 113, 395, 341, 91, 530, 76, 456, 388, 128, 137, 453, 392]
[159, 237, 128, 101, 138, 279, 437, 415, 2, 411, 397, 512, 160, 434, 45, 128, 348, 22, 375, 271, 90, 500, 120, 39, 244, 453, 128, 443, 39, 385, 99, 218, 230, 489, 240, 415, 467, 103, 346, 92, 478, 441, 306, 298, 300, 444, 310, 155, 453, 74, 128, 500, 500, 226, 444, 144, 437, 180, 315, 146, 520, 22, 180, 169, 412, 528, 66, 415, 160, 441, 412, 409, 530]


In [19]:
print(padded_paragraphs_encodging[0])
print(padded_paragraphs_encodging[19])

[421  32 309  45   9  46 375  32 308  20 128 306 226 444 111 513 411 453
  49 408 335 198 189 115 400 300 113 237 453 516 283 160 218  82 441 430
 224 271 323 105 138 113 395 341  91 530  76 456 388 128 137 453 392   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0]
[159 237 128 101 138 279 437 415   2 411 397 512 160 434  45 128 348  22
 375 271  90 500 120  39 244 453 128 443  39 385  99 218 230 489 240 415
 467 103 346  92 478 441 306 298 300 444 310 155 453  74 128 500 500 226
 444 144 437 180 315 146 520  22 180 169 412 528  66 415 160 441 412 409
 530   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0]


#### 주제를 분류하는 모델 구현하기

In [20]:
model = keras.Sequential()

# 문맥 생성 단계
model.add(keras.layers.Embedding(vocab_size, 5))
model.add(keras.layers.LSTM(64))

# 분류 단계
model.add(keras.layers.Dense(
                32,
                activation='relu'
))

model.add(keras.layers.Dense(
                2,
                activation='softmax'
))

model.build(input_shape=(None, max_length))

In [21]:
model.compile(
        loss='categorical_crossentropy',
        optimizer = 'adam',
        metrics=['accuracy']
)

In [22]:
# Feature와 Target의 Type 변경
import numpy as np
train_X = np.array(padded_paragraphs_encodging)
train_y = np.array(encoded_category)

In [25]:
model.fit(
    train_X,
    train_y,
    batch_size=10,
    epochs=50
)

Epoch 1/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 31ms/step - accuracy: 0.3500 - loss: 0.6946
Epoch 2/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.5500 - loss: 0.6927
Epoch 3/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.5500 - loss: 0.6920
Epoch 4/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 0.6000 - loss: 0.6914
Epoch 5/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - accuracy: 0.6000 - loss: 0.6909
Epoch 6/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.6000 - loss: 0.6903
Epoch 7/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.6000 - loss: 0.6897
Epoch 8/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.6000 - loss: 0.6888
Epoch 9/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [

<keras.src.callbacks.history.History at 0x20bf9e1b670>

In [26]:
model.evaluate(train_X, train_y)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 257ms/step - accuracy: 0.9500 - loss: 0.1674


[0.1673724353313446, 0.949999988079071]