In this project we're trying to make a spam detection model for emails using LSTM


In [58]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow.keras.layers import Dense,Input,GlobalMaxPooling1D,LSTM,Embedding,TextVectorization
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split

In [59]:
!wget -nc https://lazyprogrammer.me/course_files/spam.csv

File ‘spam.csv’ already there; not retrieving.



In [71]:
#Formatting the dataset
df = pd.read_csv("spam.csv",encoding = "ISO-8859-1")
df = df.drop(["Unnamed: 2","Unnamed: 3","Unnamed: 4"],axis = 1)
df.columns = ["label","data"]
df["b_label"] = df["label"].map({"ham": 0,"spam":1})
df.head()

Unnamed: 0,label,data,b_label
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [61]:
Y = df["b_label"].values
df_train,df_test,Ytrain,Ytest = train_test_split(df["data"],Y,train_size = 0.33)

In [62]:
#create df dataset
train_ds = tf.data.Dataset.from_tensor_slices((df_train.values,Ytrain))
test_ds = tf.data.Dataset.from_tensor_slices((df_test.values,Ytest))
train_ds.map(lambda x,y:x)

<_MapDataset element_spec=TensorSpec(shape=(), dtype=tf.string, name=None)>

In [63]:
#convert sentence to sequence
MAX_VOCAB_SIZE = 20_000
vectorization = TextVectorization(max_tokens=MAX_VOCAB_SIZE)
vectorization.adapt(train_ds.map(lambda x,y:x))

In [64]:
#shuffle and batch dataset
train_ds = train_ds.shuffle(10000).batch(32).prefetch(tf.data.AUTOTUNE)
test_ds = test_ds.batch(32).prefetch(tf.data.AUTOTUNE)

In [65]:
V = len(vectorization.get_vocabulary())

In [66]:
#Create the Model

#We get to choose Embedding dimensionality
D = 20

#Hidden state dimensionality
M = 15

#if string not specified it will return float casting error
i = Input(shape = (),dtype=tf.string)
x = vectorization(i)
x = Embedding(V,D)(x)
x = LSTM(M,return_sequences = True)(x)
x = GlobalMaxPooling1D()(x)
x = Dense(1,activation = "sigmoid")(x)

model = Model(i,x)


In [67]:
#Compiling the model
model.compile(loss = "binary_crossentropy",optimizer = "adam",metrics = ["accuracy"])

In [68]:
model.fit(train_ds,validation_data = test_ds,epochs = 10)

Epoch 1/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 34ms/step - accuracy: 0.8654 - loss: 0.5365 - val_accuracy: 0.8688 - val_loss: 0.3793
Epoch 2/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 27ms/step - accuracy: 0.8614 - loss: 0.3844 - val_accuracy: 0.8696 - val_loss: 0.3311
Epoch 3/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 27ms/step - accuracy: 0.9021 - loss: 0.2481 - val_accuracy: 0.9772 - val_loss: 0.1400
Epoch 4/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 37ms/step - accuracy: 0.9909 - loss: 0.0940 - val_accuracy: 0.9783 - val_loss: 0.0974
Epoch 5/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 27ms/step - accuracy: 0.9918 - loss: 0.0602 - val_accuracy: 0.9767 - val_loss: 0.0931
Epoch 6/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 27ms/step - accuracy: 0.9979 - loss: 0.0355 - val_accuracy: 0.9687 - val_loss: 0.1007
Epoch 7/10
[1m58/58[0m [32m━━━━

<keras.src.callbacks.history.History at 0x781582f647d0>

In [89]:

def predict_spam(text):

    text_tensor = tf.constant([text])
    prob = model.predict(text_tensor)[0][0]
    if prob > 0.5:
        return f"The email is a spam ({prob:.2f} confidence)"
    else:
        return f"The email in not spam ({1-prob:.2f} confidence)"



In [90]:
text = input()
print(predict_spam(str(text)))

Congratulations! you won 1000$ from us
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
The email is a spam (0.71 confidence)
