In [40]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

In [41]:
spam = pd.read_csv('spam.csv')
spam.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [42]:
spam.shape

(5572, 2)

In [43]:
# Vamos transformar as categorias em algo compreensível para o computador
labelencoder = LabelEncoder()
y = labelencoder.fit_transform(spam['Category'])
print(y)

[0 0 1 ... 0 0 0]


In [44]:
mensagens = spam['Message'].values
x_train, x_test, y_train, y_test = train_test_split(mensagens, y, test_size=0.3)

In [45]:
token = Tokenizer(num_words=1000)
token.fit_on_texts(x_train)
token.fit_on_texts(x_train)
# token.fit_on_texts(x_test)
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

In [46]:
print(x_train)
print(x_test)

[[356, 136, 13, 61, 9, 79, 1, 164, 1, 357, 17, 2, 3, 85, 41, 1, 17, 117, 19, 1, 191, 41, 358, 1, 63, 52, 15, 47, 930, 154, 1, 54, 41, 9, 4, 62, 12, 3, 167, 25, 1, 17, 2, 171, 3, 54], [104, 140, 192, 94, 96, 211, 39, 12, 931, 51], [198, 81, 80, 43, 44], [58, 277, 49, 866, 102, 230], [158, 9, 130, 359, 867, 149, 58, 25, 158, 226, 149, 58, 127, 162], [248, 868, 588, 9, 488, 51, 320, 128, 52, 556, 248, 868, 622, 40, 38, 69], [22, 9, 67, 186, 1, 141, 36, 168, 35, 2, 24, 53, 331, 53, 5, 513], [82, 82, 23, 3, 8, 677], [49, 2, 932, 535, 92, 171, 14], [75, 360, 8, 557, 235, 48], [489, 1, 292, 869, 22, 28, 278], [56, 558, 25, 558, 243, 203, 6, 8, 256, 514], [33, 16, 165, 320, 8, 96, 158, 33, 51, 22, 109, 159, 623], [933, 3, 157, 128, 2, 57, 66, 445], [48, 1, 392, 12, 249, 50, 726], [111, 56, 162, 21, 64, 1, 63, 42], [199, 3, 17, 175, 4, 112, 123, 45, 8, 96, 536, 393, 145, 77, 5, 361, 119, 2, 38, 179, 120, 147, 515, 678], [589, 3, 54, 624, 1, 106, 4, 4, 25, 679, 204, 35, 5, 244, 61, 12, 934], [34

In [47]:
# Os tokens precisam ter tamanhos iguais!
x_train = pad_sequences(x_train, padding="post", maxlen=500)
x_test = pad_sequences(x_test, padding="post", maxlen=500)
x_train

array([[356, 136,  13, ...,   0,   0,   0],
       [104, 140, 192, ...,   0,   0,   0],
       [198,  81,  80, ...,   0,   0,   0],
       ...,
       [109, 332,  35, ...,   0,   0,   0],
       [ 75, 171,   3, ...,   0,   0,   0],
       [183,  39,  17, ...,   0,   0,   0]], dtype=int32)

In [48]:
len(token.word_index)

7398

In [49]:
modelo = Sequential()
modelo.add(Embedding(input_dim=len(token.word_index),output_dim=50,input_length=500))
modelo.add(Flatten())
modelo.add(Dense(units=10, activation="relu"))
modelo.add(Dropout(0.1))
modelo.add(Dense(units=1, activation='sigmoid'))



In [50]:
modelo.compile(loss="mean_squared_error", optimizer="adam", metrics=["accuracy"])

In [51]:
modelo.summary()

In [55]:
modelo.fit(x_train, y_train, epochs=20, batch_size=10, verbose=True, validation_data=(x_test,y_test))

Epoch 1/20
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.8574 - loss: 0.1296 - val_accuracy: 0.8714 - val_loss: 0.0470
Epoch 2/20
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.8593 - loss: 0.0698 - val_accuracy: 0.9850 - val_loss: 0.0443
Epoch 3/20
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.9129 - loss: 0.0589 - val_accuracy: 0.9868 - val_loss: 0.0345
Epoch 4/20
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.9836 - loss: 0.0376 - val_accuracy: 0.9868 - val_loss: 0.0279
Epoch 5/20
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.9889 - loss: 0.0247 - val_accuracy: 0.9874 - val_loss: 0.0115
Epoch 6/20
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.9952 - loss: 0.0049 - val_accuracy: 0.9886 - val_loss: 0.0101
Epoch 7/20
[1m390/390[0m 

<keras.src.callbacks.history.History at 0x75e163abe840>

In [57]:
loss, accuracy = modelo.evaluate(x_test,y_test)
print("Loss: ", loss)
print("Accuracy: ", accuracy)

[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9878 - loss: 0.0110
Loss:  0.011142415925860405
Accuracy:  0.9880383014678955


In [None]:
nova_previsao = modelo.predict(x_test)
print(nova_previsao)

In [None]:
prev = (nova_previsao > 0.5)
print(prev)

In [None]:
cm = confusion_matrix(y_test, prev)
print(cm)