<a href="https://colab.research.google.com/github/Vitor-Sallenave/Formacao-em-NLP/blob/main/ML-and-DL/Spam_Classifier_Keras.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## ***◼️ Libraries***

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from keras.models import Sequential
from keras.layers import Dense, Dropout
from google.colab import files

## ***◼️ Dataset***

In [None]:
# Colab-only helper (imported from google.colab above).
# Presumably used to bring 'spam.csv', which the next cell reads, into the runtime.
files.upload()

In [None]:
# Load the dataset.
# The first CSV column is an unnamed, previously saved row index (it appears as
# "Unnamed: 0" when read with default options), so it is used as the DataFrame
# index instead of being kept as a meaningless data column.
spams = pd.read_csv('spam.csv', index_col=0)
spams.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## ***◼️ Transforming the data***

In [None]:
# Building a transformer that encodes the target column "Category" as integers
le = LabelEncoder()
encoded_categories = le.fit_transform(spams['Category'])

# Learned class labels first, then the encoded target vector
print(le.classes_, encoded_categories, sep='\n')

['ham' 'spam']
[0 0 1 ... 0 0 0]


## ***◼️ Vectorizing***

In [None]:
# Vectorizing the text messages with a CountVectorizer: the result has one row
# per message and one column per vocabulary term.
# NOTE(review): the vocabulary is fitted on every message, before the
# train/test split done further down; consider fitting on the training
# messages only so no test-set vocabulary leaks into the features.
cv = CountVectorizer()
matrix = cv.fit_transform(spams['Message'])
matrix.shape

(5572, 8709)

## ***◼️ Creating the Neural Network***

In [None]:
# Hold out 30% of the samples for testing.
# random_state pins the shuffle so the split (and every metric derived from it)
# is reproducible after a kernel restart; stratify keeps the ham/spam
# proportion the same in the training and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    matrix,
    encoded_categories,
    test_size=0.3,
    random_state=42,
    stratify=encoded_categories,
)

In [None]:
# Shape of the training partition, then shape of the full feature matrix
print(X_train.shape, matrix.shape, sep='\n')

(3900, 8709)
(5572, 8709)


In [None]:
# Peek at the first few rows only: converting the whole sparse training matrix
# to a dense array just to print it allocates a full rows x vocabulary array
# for no benefit.
print(X_train[:5].toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [None]:
# Creating the sequential model (starts out with no layers)
model = Sequential()

In [None]:
# Adding layers to the model
# The model is (re)created at the top of this cell so that running the cell
# more than once does not stack a second copy of the layers onto an
# already-populated model.
model = Sequential()

# units: number of neurons; input_dim: number of features (vocabulary size)
model.add(Dense(units=10, activation='relu', input_dim=X_train.shape[1]))
# Dropout layers help reduce overfitting by randomly zeroing a fraction
# (here 10%) of the previous layer's outputs during training
model.add(Dropout(0.1))
model.add(Dense(8, activation='relu'))
model.add(Dropout(0.1))

# Last layer: a single sigmoid unit that outputs a value between 0 and 1
model.add(Dense(1, activation='sigmoid'))

In [None]:
# Defining the compilation configuration of the model for training.
# The network ends in a single sigmoid unit and the targets are 0/1 labels, so
# binary cross-entropy is the matching loss; mean squared error is a regression
# loss and gives weak gradients when the sigmoid saturates.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Printing the layer-by-layer summary: output shapes and parameter counts
model.summary()

Model: "My first model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 10)                87100     
                                                                 
 dropout (Dropout)           (None, 10)                0         
                                                                 
 dense_1 (Dense)             (None, 8)                 88        
                                                                 
 dropout_1 (Dropout)         (None, 8)                 0         
                                                                 
 dense_2 (Dense)             (None, 1)                 9         
                                                                 
Total params: 87197 (340.61 KB)
Trainable params: 87197 (340.61 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Selecting a specific layer by its position in the model (index 0 here)
model.get_layer(index=0)

<keras.src.layers.core.dense.Dense at 0x78db4337bd90>

In [None]:
# Training the model
# epochs: number of complete passes over the training data
# batch_size: number of samples processed before each weight update
# NOTE(review): the test split is also passed as validation data here and is
# the same split used for the final evaluation below.
model.fit(
    X_train,
    y_train,
    batch_size=10,
    epochs=20,
    validation_data=(X_test, y_test),
    verbose=True,
)

NameError: ignored

In [None]:
# Evaluating the model on the held-out split.
# The two returned values are unpacked as the loss and the accuracy metric
# requested at compile time.
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Loss = {loss} | Accuracy = {accuracy}')

In [None]:
# New predictions: the output of the final sigmoid unit for each test message
# (values between 0 and 1, treated as probabilities)
new_predictions = model.predict(X_test)
print(new_predictions)

In [None]:
# Applying a 0.5 decision threshold: True where the predicted value exceeds 0.5.
# The boolean result is only printed here, not stored.
print(new_predictions > 0.5)

In [None]:
# Creating the confusion matrix.
# confusion_matrix needs discrete class labels, but model.predict returns
# continuous sigmoid outputs in a column of shape (n_samples, 1). Apply the 0.5
# threshold, cast to 0/1 integers and flatten so the predictions have the same
# form as y_test.
predicted_labels = (new_predictions > 0.5).astype(int).ravel()
cm = confusion_matrix(y_true=y_test, y_pred=predicted_labels)
print(cm)