#### Spam SMS 분류
: SMS가 스팸인지 아닌지 예측

In [1]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import zipfile
import requests
from io import BytesIO

import warnings
warnings.filterwarnings('ignore')

In [4]:
# Load the data: download the SMS Spam Collection archive from the UCI repository.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Fail fast on HTTP errors; otherwise an error page would be handed to ZipFile
# and surface as a confusing BadZipFile exception instead of the real cause.
response.raise_for_status()

with zipfile.ZipFile(BytesIO(response.content)) as zip_ref:
    # Read only the "SMSSpamCollection" member of the zip archive.
    with zip_ref.open("SMSSpamCollection") as file:
        # Tab-separated file; column names are supplied here because the first
        # line of the file is already a data row.
        # NOTE(review): read_csv's default quoting treats '"' as a quote
        # character; if messages contain unbalanced quotes, rows could be
        # merged. Consider quoting=csv.QUOTE_NONE and verify the row count.
        data = pd.read_csv(file, sep="\t", names=['label', 'message'])

In [6]:
# Preview the first rows of the loaded frame (label and message columns).
data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# Preview the last rows of the loaded frame.
data.tail()

Unnamed: 0,label,message
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [8]:
# Print the column dtypes, non-null counts and memory usage of the frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [9]:
# List the distinct values of the label column.
data.label.unique()

array(['ham', 'spam'], dtype=object)

#### 데이터 전처리

In [10]:
# Label encoding: turn the string labels into integers (ham -> 0, spam -> 1).
label_encoder = LabelEncoder()
# Learn the label classes first, then map the column onto their integer codes.
label_encoder.fit(data['label'])
data['label'] = label_encoder.transform(data['label'])

In [11]:
# Preview the frame again to check the label column after encoding.
data.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [47]:
# Tokenize the message text and pad the resulting sequences to a fixed length.
# NOTE(review): the tokenizer is fitted on the full dataset, i.e. before the
# train/test split performed later, so the vocabulary also reflects test
# messages. Consider fitting on the training messages only.
tokenizer = Tokenizer(num_words=5000)
messages = data['message']
tokenizer.fit_on_texts(messages)

# Convert every message into a list of integer word indices.
sequences = tokenizer.texts_to_sequences(messages)

# Bring every sequence to length 100 so the result is a rectangular array.
X = pad_sequences(sequences, maxlen=100)

In [15]:
# Target data: the encoded label column as a NumPy array.
y = data['label'].to_numpy()
# Display the array as the cell output.
y

array([0, 0, 1, ..., 0, 0, 0])

In [16]:
# Display the padded sequence matrix produced by pad_sequences.
X

array([[   0,    0,    0, ...,   58, 4437,  144],
       [   0,    0,    0, ...,  472,    6, 1940],
       [   0,    0,    0, ...,  660,  392, 2998],
       ...,
       [   0,    0,    0, ...,   23,  107,  251],
       [   0,    0,    0, ...,  200,   12,   47],
       [   0,    0,    0, ...,    2,   61,  268]], dtype=int32)

In [17]:
# Train/test split: hold out 20% of the samples for testing. The fixed
# random_state makes the split reproducible across runs.
# NOTE(review): the split is not stratified; if the labels are imbalanced,
# consider passing stratify=y so both parts keep the same spam ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### LSTM Model

In [48]:
# Two stacked LSTM layers, each followed by dropout, feeding one sigmoid unit.
model = Sequential([
    # Embeds each of the 5000 word indices as a 5-dimensional vector.
    Embedding(5000, 5),
    # return_sequences=True hands the full sequence on to the next LSTM layer.
    LSTM(64, return_sequences=True),
    Dropout(0.5),  # dropout between the two recurrent layers
    LSTM(32),  # additional LSTM layer
    Dropout(0.5),
    # Single sigmoid output for the binary ham/spam decision.
    Dense(1, activation='sigmoid'),
])

# Build the model for inputs of length 100, matching the maxlen used for padding.
model.build(input_shape=(None, 100))

In [49]:
# Compile the model: Adam optimizer, binary cross-entropy loss, and accuracy
# reported as the metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [50]:
# Train the model for 5 epochs with mini-batches of 64 samples.
# NOTE(review): the held-out test split is also used as validation data, so
# the validation scores are not independent of the final test evaluation.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=64,
    epochs=5,
)

Epoch 1/5
70/70 ━━━━━━━━━━━━━━━━━━━━ 6s 55ms/step - accuracy: 0.8930 - loss: 0.3316 - val_accuracy: 0.9345 - val_loss: 0.1713
Epoch 2/5
70/70 ━━━━━━━━━━━━━━━━━━━━ 4s 51ms/step - accuracy: 0.9706 - loss: 0.1155 - val_accuracy: 0.9794 - val_loss: 0.0648
Epoch 3/5
70/70 ━━━━━━━━━━━━━━━━━━━━ 4s 51ms/step - accuracy: 0.9859 - loss: 0.0637 - val_accuracy: 0.9883 - val_loss: 0.0474
Epoch 4/5
70/70 ━━━━━━━━━━━━━━━━━━━━ 4s 51ms/step - accuracy: 0.9904 - loss: 0.0452 - val_accuracy: 0.9910 - val_loss: 0.0389
Epoch 5/5
70/70 ━━━━━━━━━━━━━━━━━━━━ 4s 51ms/step - accuracy: 0.9939 - loss: 0.0336 - val_accuracy: 0.9910 - val_loss: 0.0411


In [45]:
# Train score: evaluate the model on the training split (returns [loss, accuracy]).
# NOTE(review): this cell's execution counter (45) is lower than the training
# cell's (50), so the recorded output may come from an earlier run — re-run to confirm.
model.evaluate(X_train, y_train)

140/140 ━━━━━━━━━━━━━━━━━━━━ 2s 14ms/step - accuracy: 0.9937 - loss: 0.0265


[0.026517929509282112, 0.9937177300453186]

In [46]:
# Validation score: evaluate the model on the held-out split (returns [loss, accuracy]).
# NOTE(review): this cell's execution counter (46) is lower than the training
# cell's (50), so the recorded output may come from an earlier run — re-run to confirm.
model.evaluate(X_test, y_test)

35/35 ━━━━━━━━━━━━━━━━━━━━ 1s 14ms/step - accuracy: 0.9883 - loss: 0.0416


[0.04160343110561371, 0.9883407950401306]