In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

In [2]:
df = pd.read_csv("combined_data.csv")

In [3]:
x_train, x_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

In [4]:
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV')
tokenizer.fit_on_texts(x_train)

In [5]:
x_train_seq = tokenizer.texts_to_sequences(x_train)
x_test_seq = tokenizer.texts_to_sequences(x_test)

In [6]:
max_len = 20
x_train_pad = pad_sequences(x_train_seq, maxlen=max_len, padding='post')
x_test_pad = pad_sequences(x_test_seq, maxlen=max_len, padding='post')

In [7]:
model = Sequential([
    Embedding(input_dim=10000, output_dim=64, input_length=max_len),
    LSTM(64),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])



In [8]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [9]:
model.fit(x_train_pad, y_train, epochs=5, batch_size=4, validation_data=(x_test_pad, y_test))

Epoch 1/5
[1m16690/16690[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m190s[0m 11ms/step - accuracy: 0.9241 - loss: 0.1877 - val_accuracy: 0.9645 - val_loss: 0.0975
Epoch 2/5
[1m16690/16690[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m196s[0m 12ms/step - accuracy: 0.9739 - loss: 0.0715 - val_accuracy: 0.9679 - val_loss: 0.0877
Epoch 3/5
[1m16690/16690[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m212s[0m 13ms/step - accuracy: 0.9830 - loss: 0.0473 - val_accuracy: 0.9679 - val_loss: 0.0936
Epoch 4/5
[1m16690/16690[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m190s[0m 11ms/step - accuracy: 0.9890 - loss: 0.0322 - val_accuracy: 0.9668 - val_loss: 0.1071
Epoch 5/5
[1m16690/16690[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 9ms/step - accuracy: 0.9924 - loss: 0.0223 - val_accuracy: 0.9656 - val_loss: 0.1323


<keras.src.callbacks.history.History at 0x154bcd626c0>

In [10]:
loss, accuracy = model.evaluate(x_test_pad, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

[1m522/522[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9663 - loss: 0.1248
Test Accuracy: 0.9656


In [11]:
# def predict(text):
#     seq = tokenizer.texts_to_sequences([text])
#     padded = pad_sequences(seq, maxlen=max_len, padding='post')
#     pred = model.predict(padded)[0][0]
#     return "Spam" if pred > 0.5 else "Not Spam"

In [12]:
# # Example 1: Obvious spam
# print("Test 1:", predict("Congratulations! You've won a free iPhone. Click here to claim now."))

# # Example 2: Normal message
# print("Test 2:", predict("Hey, can we reschedule our meeting for tomorrow?"))

# # Example 3: Spam with urgency
# print("Test 3:", predict("Urgent! Your bank account has been compromised. Login now."))

# # Example 4: Friendly message
# print("Test 4:", predict("Let's grab lunch this afternoon."))

# # Example 5: Contest/Prize offer
# print("Test 5:", predict("Win ₹1,00,000 cash prize! Enter the contest today."))

In [13]:
# import gradio as gr

# interface = gr.Interface(
#     fn=predict,
#     inputs=gr.Textbox(lines=4, placeholder="Enter a message here..."),
#     outputs="text",
#     title="Spam Message Classifier",
#     description="Enter a message and the model will predict whether it's Spam or Not Spam."
# )

# interface.launch()

In [14]:
model.save("Spam-Email-Classifier.h5")



In [15]:
import pickle

In [17]:
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

In [18]:
from tensorflow.keras.models import load_model

In [19]:
model = load_model("Spam-Email-Classifier.h5")



In [20]:
with open("tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)