<a href="https://colab.research.google.com/github/Vitor-Sallenave/Spam-Classifier/blob/main/Spam_Classifier_Embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## ***◼️ Libraries***

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from google.colab import files

## ***◼️ Dataset***

In [None]:
# Colab only: uncomment the line below to upload a file from the local machine
# through the browser (presumably spam.csv, which the next cell reads).
# files.upload()

In [None]:
# Load the dataset from the working directory and preview the first rows
DATA_PATH = 'spam.csv'
spams = pd.read_csv(DATA_PATH)
spams.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## ***◼️ Transforming the data***

In [None]:
# Encode the target column "Category" as integers.
# The position of each label in `le.classes_` is the integer it is mapped to.
le = LabelEncoder()
le.fit(spams['Category'])
encoded_categories = le.transform(spams['Category'])

print(le.classes_)
print(encoded_categories)

['ham' 'spam']
[0 0 1 ... 0 0 0]


## ***◼️ Vectorizing***

In [None]:
# Creating the vectorizer.
# With num_words=1000, texts_to_sequences keeps only the most frequent words
# (word indices below 1000); rarer words are dropped from the sequences.
# NOTE(review): the tokenizer is fitted on every message, including those that
# later end up in the test split — consider fitting on the training text only.
tk = Tokenizer(num_words=1000)
tk.fit_on_texts(spams.Message)
sequences = tk.texts_to_sequences(spams.Message)

print(len(sequences))
# Show a small sample only; printing every sequence floods the cell output.
print(sequences[:3])

5572
[[49, 471, 840, 755, 658, 65, 8, 88, 123, 351, 148, 67, 58, 144], [46, 336, 472, 6], [47, 489, 8, 19, 4, 796, 901, 2, 175, 659, 261, 71, 2, 2, 337, 489, 555, 960, 73, 390, 179, 660, 391], [6, 247, 150, 23, 381, 6, 139, 153, 57, 150], [1, 98, 108, 69, 490, 2, 961, 69, 221, 112, 473], [797, 128, 67, 145, 109, 158, 21, 7, 38, 338, 89, 902, 55, 116, 413, 3, 44, 12, 14, 85, 46, 365, 960, 2, 68, 323, 234, 2], [210, 11, 633, 9, 25, 55, 2, 382, 36, 10, 110, 718, 10, 55], [72, 235, 13, 798, 118, 109, 609, 72, 13, 12, 51, 841, 392, 2, 13, 248], [719, 72, 4, 842, 440, 236, 3, 17, 109, 441, 2, 154, 962, 2, 129, 16, 129, 414, 516, 963, 581, 65], [136, 13, 97, 686, 26, 133, 6, 82, 2, 491, 2, 5, 324, 534, 903, 36, 339, 12, 47, 16, 5, 97, 491, 243, 47, 18], [30, 237, 35, 80, 222, 7, 1, 98, 70, 2, 288, 81, 40, 289, 226, 95, 208, 442, 90], [2, 175, 159, 48, 720, 2, 904, 443, 634, 73, 7, 68, 2, 373, 187, 64, 252, 391, 94, 41, 721], [195, 3, 17, 191, 4, 119, 113, 47, 8, 92, 517, 443, 154, 73, 5, 338,

In [None]:
# The network needs fixed-size inputs: bring every sequence to a length of 500,
# adding the padding at the end of the sequence ('post').
sequences_padded = pad_sequences(
    sequences,
    maxlen=500,
    padding='post',
)

## ***◼️ Creating the Neural Network***

In [None]:
# Hold out 30% of the messages for testing.
# random_state makes the split reproducible across kernel restarts;
# stratify keeps the ham/spam proportion the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(sequences_padded,
                                                    encoded_categories,
                                                    test_size=0.3,
                                                    random_state=42,
                                                    stratify=encoded_categories)

In [None]:
# Sanity check: shape of the training split, then of the full padded matrix
for matrix in (X_train, sequences_padded):
    print(matrix.shape)

(3900, 500)
(5572, 500)


In [None]:
# Creating the sequential model: an empty container for a linear stack of
# layers, bound to the name `model`
model = Sequential()

In [None]:
# Number of rows needed by the embedding table (Embedding's input_dim).
# Word indices start at 1 (0 is left for padding), so the table must hold
# "highest index + 1" rows. When the tokenizer is capped with num_words,
# texts_to_sequences only emits indices below num_words, so sizing the table
# from the whole word_index would allocate rows that are never looked up.
full_vocabulary = len(tk.word_index) + 1
if tk.num_words:
    vocabulary = min(tk.num_words, full_vocabulary)
else:
    vocabulary = full_vocabulary
print(vocabulary)

9004


In [None]:
# Defining the layers of the model.
# The whole stack is built in a single expression so that re-running this cell
# always yields the same architecture (repeated model.add() calls would keep
# appending duplicate layers to an existing model).
model = Sequential([
    # Embedding layer: maps each word index to a 50-dimensional vector
    Embedding(input_dim=vocabulary, output_dim=50, input_length=500),

    # Flatten is used to connect the Embedding output to the Dense layers
    Flatten(),

    # Units: number of neurons
    Dense(units=10, activation='relu'),

    # Dropout reduces overfitting by randomly zeroing 10% of the activations
    # during training
    Dropout(0.1),

    # Last layer: a single sigmoid unit giving a probability-like score
    Dense(1, activation='sigmoid'),
])

In [None]:
# Defining the compilation configuration of the model for training.
# The output is a single sigmoid unit for a two-class target, so binary
# cross-entropy is the appropriate loss (mean squared error gives weak
# gradients for confident wrong predictions in this setting).
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Print the layer stack with each layer's output shape and parameter count
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 500, 50)           450200    
                                                                 
 flatten_1 (Flatten)         (None, 25000)             0         
                                                                 
 dense_11 (Dense)            (None, 10)                250010    
                                                                 
 dropout_7 (Dropout)         (None, 10)                0         
                                                                 
 dense_12 (Dense)            (None, 1)                 11        
                                                                 
Total params: 700221 (2.67 MB)
Trainable params: 700221 (2.67 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Picking one layer by its position in the stack (0 = first layer)
model.layers[0]

<keras.src.layers.core.embedding.Embedding at 0x782dfc332d10>

In [None]:
# Training the model
# epochs: number of full passes over the training data
# batch_size: number of samples processed before each weight update
# The test split is passed as validation data, so per-epoch validation metrics
# are computed on the same rows that are evaluated afterwards.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=10,
    epochs=20,
    verbose=1,
    validation_data=(X_test, y_test),
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x782dfe2a0fa0>

In [None]:
# Evaluating the model on the held-out split.
# evaluate() returns the loss followed by the compiled metrics (accuracy here);
# report them explicitly so the result stays visible in the notebook.
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test loss: {loss:.4f} | Test accuracy: {accuracy:.4f}")



In [None]:
# Scoring the test split: one sigmoid output (a value between 0 and 1) per row
new_predictions = model.predict(x=X_test)
print(new_predictions)

[[9.9709564e-01]
 [1.9334954e-07]
 [1.4005596e-08]
 ...
 [2.9883932e-08]
 [9.9997580e-01]
 [3.8545064e-08]]


In [None]:
# Applying a 0.5 decision threshold: True where the predicted score exceeds it
above_threshold = new_predictions > 0.5
print(above_threshold)

[[ True]
 [False]
 [False]
 ...
 [False]
 [ True]
 [False]]


In [None]:
# Confusion matrix of the thresholded predictions against the true labels
# (rows: true class, columns: predicted class)
predicted_labels = new_predictions > 0.5
cm = confusion_matrix(y_test, predicted_labels)
print(cm)