In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Embedding
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

In [13]:
spam = pd.read_csv("spam.csv")
spam.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
spam.shape

(5572, 2)

In [4]:
labelencoder = LabelEncoder()
y = labelencoder.fit_transform(spam['Category'])
print(y)

[0 0 1 ... 0 0 0]


In [19]:
mensagens = spam['Message'].values
X_train, X_test, y_train, y_test = train_test_split(mensagens, y, test_size=0.3)

In [20]:
token = Tokenizer(num_words=1000)
token.fit_on_texts(X_train)
token.fit_on_texts(X_train)
X_train = token.texts_to_sequences(X_train)
X_test = token.texts_to_sequences(X_test)

In [21]:
print(X_train)

[[436, 2, 369, 28, 5, 44, 9, 207, 8, 457, 155, 57, 253, 28, 267, 728, 135], [484, 3, 10, 7, 48, 14, 9, 63, 238, 31, 3], [3, 28, 781, 3, 136, 239, 2, 919, 46, 49, 117, 203, 568, 133, 12, 835, 7, 920, 7, 1, 21, 268, 2, 919, 46, 96, 782], [10, 89, 51, 3, 63, 2, 648], [208, 212, 1, 30, 16, 3], [1, 17, 19, 602, 110, 7, 7, 106, 540, 200, 680], [103, 130, 10, 29, 9, 409], [40, 8, 5, 458, 60, 39, 17, 541, 19, 204, 6, 6, 17, 175, 5, 681, 147, 19, 125, 8, 355, 16, 22, 65, 459, 225, 213, 335, 485, 370], [126, 371, 1, 61, 921, 51, 3, 131, 4, 922, 31, 3, 372, 63, 2, 10, 1, 72, 2, 69, 4, 2, 195, 25, 13, 542, 53, 219, 12, 156, 309, 923, 437, 51, 3, 682, 45, 18, 12, 10, 2, 53, 2, 7, 836, 5, 245, 3, 113, 10, 49, 2, 34, 18, 26, 34, 3, 58, 109, 235, 137, 2, 836, 345, 309, 122, 27, 836, 302, 47, 5, 101, 240, 184, 32, 119, 11, 392], [103, 208, 393, 5, 102, 99, 47, 65, 924, 28, 73, 310, 356, 2, 37, 35, 504, 22], [12, 71, 925, 114, 148, 569, 649, 33, 74, 37, 926, 570, 240], [505, 927, 783, 2, 10], [38, 57, 2

In [22]:
X_train = pad_sequences(X_train, padding="post", maxlen=500)
X_test = pad_sequences(X_test, padding="post", maxlen=500)

In [25]:
len(token.word_index)

7519

In [23]:
X_train

array([[436,   2, 369, ...,   0,   0,   0],
       [484,   3,  10, ...,   0,   0,   0],
       [  3,  28, 781, ...,   0,   0,   0],
       ...,
       [ 92, 414, 408, ...,   0,   0,   0],
       [ 47, 148, 198, ...,   0,   0,   0],
       [292,   1,  62, ...,   0,   0,   0]], dtype=int32)

In [26]:
modelo = Sequential()
modelo.add(Embedding(input_dim=len(token.word_index),output_dim=50,input_length=500))
modelo.add(Flatten())
modelo.add(Dense(units=10, activation="relu"))
modelo.add(Dropout(0.1))
modelo.add(Dense(units=1, activation='sigmoid'))

In [27]:
modelo.compile(loss="mean_squared_error", optimizer="adam", metrics=["accuracy"])

In [28]:
modelo.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 50)           375950    
                                                                 
 flatten (Flatten)           (None, 25000)             0         
                                                                 
 dense (Dense)               (None, 10)                250010    
                                                                 
 dropout (Dropout)           (None, 10)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 11        
                                                                 
Total params: 625,971
Trainable params: 625,971
Non-trainable params: 0
_________________________________________________________________


In [29]:
modelo.fit(X_train, y_train, epochs=20, batch_size=10, verbose=True, validation_data=(X_test,y_test))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7ff1d339bc40>

In [30]:
loss, accuracy = modelo.evaluate(X_test,y_test)
print("Loss: ", loss)
print("Acurácia: ", accuracy)

Loss:  0.01732429675757885
Acurácia:  0.980861246585846


In [32]:
nova_previsao = modelo.predict(X_test)
print(nova_previsao)

[[8.5130618e-11]
 [5.5945097e-09]
 [6.6293190e-08]
 ...
 [2.0365205e-15]
 [1.7968249e-08]
 [5.6360790e-04]]


In [33]:
prev = (nova_previsao > 0.5)
print(prev)

[[False]
 [False]
 [False]
 ...
 [False]
 [False]
 [False]]


In [34]:
cm = confusion_matrix(y_test, prev)
print(cm)

[[1423    5]
 [  27  217]]
