In [2]:
import pandas as pd
import numpy as np
import re
from collections import Counter
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import layers

In [3]:
print(tf.__version__)


2.20.0


In [4]:
df = pd.read_csv("sms_spam.csv")

**Proccessing the Data before training**

In [5]:
#create an array that is 0 if ham and 1 if spam
labels = (df["label"]=="spam").astype(int)

stopwords = ["the", "and", "is", "in", "to", "a", "of", "i", "you", "u", "me", "it", "for", "your", "my"]
#lower to lowercase a string, str to apply it to all strings in the column
X_texts = df["message"].str.lower()

#replace any characters that are not a-z or 0-9
X_texts = X_texts.replace(r'\W+', " ", regex=True)

#split words and remove any stopwords
X_texts  = [text.split() for text in X_texts]
X_texts = [[w for w in text if w not in stopwords] for text in X_texts]

#create a dictionary that counts the instances of each word in X_texts
vocab = Counter()
[vocab.update(text) for text in X_texts]

#create a dict of the most common words as keys and index as value
vocab = {word:idx for idx, (word,__) in enumerate(vocab.most_common(3000))}

#create an array of vectors to show how many times a word from vocab appears in each message
X_data = [[0]*len(vocab) for i in range(len(X_texts))] ##array of vectors of 0s as initialization
for i, text in enumerate(X_texts):
    for word in text: 
        if(word in vocab):
            X_data[i][vocab[word]]=+1
print(len(df[df["label"]=="spam"])/len(df))
print(len(df[df["label"]=="ham"])/len(df))


0.13406317300789664
0.8659368269921034


In [6]:
X_data = np.array(X_data) 
Y_data = np.array(labels)


X_train, X_test, Y_train, Y_test = train_test_split(X_data, Y_data, stratify=Y_data, test_size = 0.2)



In [8]:
model = tf.keras.Sequential([
    layers.Dense(128, activation="relu", input_shape=(len(vocab),)),
    layers.Dense(64, activation="relu"),
    layers.Dense(1, activation="sigmoid")
])

In [9]:
model.compile(
    optimizer = "sgd",
    loss = "binary_crossentropy",
    metrics = ["accuracy"]
)

In [10]:
model.fit(
    X_train,
    Y_train,
    epochs=1000,
    batch_size=len(X_train),
    validation_data = (X_test, Y_test)
    
)

Epoch 1/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 924ms/step - accuracy: 0.8364 - loss: 0.6688 - val_accuracy: 0.8404 - val_loss: 0.6656
Epoch 2/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 160ms/step - accuracy: 0.8499 - loss: 0.6664 - val_accuracy: 0.8511 - val_loss: 0.6632
Epoch 3/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 216ms/step - accuracy: 0.8564 - loss: 0.6640 - val_accuracy: 0.8601 - val_loss: 0.6609
Epoch 4/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 166ms/step - accuracy: 0.8618 - loss: 0.6616 - val_accuracy: 0.8628 - val_loss: 0.6585
Epoch 5/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 173ms/step - accuracy: 0.8647 - loss: 0.6593 - val_accuracy: 0.8646 - val_loss: 0.6561
Epoch 6/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 167ms/step - accuracy: 0.8658 - loss: 0.6569 - val_accuracy: 0.8664 - val_loss: 0.6538
Epoch 7/1000
[1m1/1[0m [3

<keras.src.callbacks.history.History at 0x11cf9a9c5b0>

In [26]:
from sklearn.metrics import precision_score, recall_score, f1_score

y_pred = model.predict(X_test)
Y_test_predict = (y_pred > 0.265).astype(int)


print("Precision:", precision_score(Y_test, Y_test_predict))  #how many were identified as spam were actually spam
print("Recall:", recall_score(Y_test, Y_test_predict))        #how many spam were not missed
print("F1 Score:", f1_score(Y_test, Y_test_predict))

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Precision: 1.0
Recall: 0.7785234899328859
F1 Score: 0.8754716981132076


In [27]:
from sklearn.metrics import confusion_matrix, classification_report

print(confusion_matrix(Y_test, Y_test_predict))
print(classification_report(Y_test, Y_test_predict, digits=3))

[[966   0]
 [ 33 116]]
              precision    recall  f1-score   support

           0      0.967     1.000     0.983       966
           1      1.000     0.779     0.875       149

    accuracy                          0.970      1115
   macro avg      0.983     0.889     0.929      1115
weighted avg      0.971     0.970     0.969      1115

