# 스팸 분류

In [1]:
import numpy as np
from tensorflow.keras.layers import Dense, Embedding, Flatten, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.utils import set_random_seed

In [2]:
# Toy corpus of e-mail subject lines.
# The first five entries are the spam examples and the last five the
# non-spam examples; the order must stay aligned with `labels`.
docs = [
    "additional income",
    "best price",
    "big bucks",
    "cash bonus",
    "earn extra cash",
    "spring savings certificate",
    "valero gas marketing",
    "all domestic employees",
    "nominations for oct",
    "confirmation from spinner",
]

In [3]:
# Size of the hashing space handed to one_hot; also used as the Embedding input_dim.
vocab_size = 50

# Targets aligned with `docs`: 1 = spam (first five), 0 = not spam (last five).
labels = np.array([1] * 5 + [0] * 5)

# Turn every document into a list of integer word indices.
# NOTE(review): one_hot hashes words with Python's built-in hash() by default,
# so the indices can change between kernel sessions and distinct words may
# collide in a space of only 50 buckets — confirm against the Keras docs.
encoded_docs = [one_hot(text, vocab_size) for text in docs]

In [4]:
# Show the integer-encoded documents (one list of word indices per document).
print(encoded_docs)

[[42, 12], [28, 43], [21, 21], [10, 16], [14, 18, 10], [7, 31, 29], [40, 24, 17], [9, 10, 10], [24, 48, 36], [35, 8, 36]]


In [5]:
# Fixed sequence length fed to the model; shorter documents are
# right-padded ('post') with zeros up to this length.
max_len = 3
padded_docs = pad_sequences(
    sequences=encoded_docs,
    maxlen=max_len,
    padding='post',
)
print(padded_docs)

[[42 12  0]
 [28 43  0]
 [21 21  0]
 [10 16  0]
 [14 18 10]
 [ 7 31 29]
 [40 24 17]
 [ 9 10 10]
 [24 48 36]
 [35  8 36]]


## 모델 생성

In [6]:
# Seed the Python, NumPy and backend random number generators before any
# layer is created, so weight initialisation starts from the same state on
# every run of this notebook.
set_random_seed(42)

# Empty linear stack; the layers are appended one at a time in the cells below.
model = Sequential()

$50 \times 8$ 크기의 무작위로 초기화된 임베딩 행렬을 갖는 임베딩 레이어

입력의 마지막 샘플(`padded_docs[-1]`)인 `[35 8 36]`의 경우 (`one_hot`은 파이썬 `hash`를 사용하므로 인덱스 값은 커널을 재시작할 때마다 달라질 수 있다)
* `35` $\Rightarrow$ `Embedding` $\Rightarrow$ $1 \times 8$ vector
* `8` $\Rightarrow$ `Embedding` $\Rightarrow$ $1 \times 8$ vector
* `36` $\Rightarrow$ `Embedding` $\Rightarrow$ $1 \times 8$ vector

생성한 세 벡터를 쌓아서 $1 \times 3 \times 8$ 텐서 (배치 $\times$ 단어 수 $\times$ 임베딩 차원)

In [7]:
# Declare the input shape explicitly: each sample is a sequence of max_len
# integer word indices. Keras 3 deprecates and ignores Embedding's
# `input_length` argument, so without an explicit Input the model stays
# unbuilt and model.summary() cannot report output shapes.
model.add(Input(shape=(max_len,), dtype='int32'))

# Trainable lookup table: every index in [0, vocab_size) maps to an
# 8-dimensional vector, giving an output of shape (batch, max_len, 8).
model.add(Embedding(vocab_size, 8))



* $1 \times 3 \times 8$ 텐서 $\Rightarrow$ `Flatten` $\Rightarrow$ $1 \times 24$ vector

In [8]:
# Collapse the per-sample (max_len, 8) embedding output into a single
# vector of 3 * 8 = 24 values so it can feed a Dense layer.
model.add(Flatten())

$1 \times 24$ vector $\Rightarrow$ `Dense` $\Rightarrow$ 스팸일 확률

In [9]:
# Single sigmoid unit: squashes the flattened features into a value in
# (0, 1) that is read as the probability of the document being spam.
model.add(Dense(units=1, activation='sigmoid'))

In [10]:
# Print the layer stack with output shapes and parameter counts.
model.summary()

**모델 컴파일, 학습 및 평가**

In [11]:
# Binary classification set-up: cross-entropy loss on the sigmoid output,
# Adam optimiser, and accuracy reported alongside the loss.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

2024-06-12 13:30:13.177368: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2024-06-12 13:30:13.177411: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-06-12 13:30:13.177450: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-06-12 13:30:13.177497: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-06-12 13:30:13.177521: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [12]:
# Train silently (verbose=0) for 250 passes over the ten padded documents.
model.fit(
    x=padded_docs,
    y=labels,
    epochs=250,
    verbose=0,
)

2024-06-12 13:30:13.421543: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


<keras.src.callbacks.history.History at 0x16b199a20>

In [13]:

# Score the model on the same ten documents it was trained on, so the
# number printed below is training accuracy, not held-out accuracy.
loss, accuracy = model.evaluate(x=padded_docs, y=labels, verbose=0)
print(accuracy)

1.0


In [14]:
# Unseen subject line built from words that occur in the spam examples.
test_doc = ['big income']

# Encode and pad it exactly like the training documents.
# NOTE: this rebinds `encoded_docs` and `padded_docs`, replacing the training
# arrays; re-run the encoding and padding cells before re-running fit/evaluate.
encoded_docs = [one_hot(text, vocab_size) for text in test_doc]
padded_docs = pad_sequences(sequences=encoded_docs, maxlen=max_len, padding='post')

print(encoded_docs)
print(padded_docs)

[[21, 12]]
[[21 12  0]]


In [15]:
# Sigmoid output for the test document (padded_docs was rebound to it in
# the previous cell); values close to 1 mean "spam".
print(model.predict(x=padded_docs))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 234ms/step
[[0.9164098]]
