In [2]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, SpatialDropout1D, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Step 1: Load or Prepare the Dataset
# Load the dataset from an Excel file.
# Replace "data.xlsx" with the path to your own file if it differs.
df = pd.read_excel("data.xlsx")

# The first two columns are taken positionally as label and text, whatever
# their original header names are.
df = df.rename(columns={df.columns[0]: "label", df.columns[1]: "text"})

# Convert label to binary ('spam' -> 1, 'ham' -> 0).
# Labels are normalized first so that variants such as "Spam" or " ham " are
# not silently mapped to NaN, and anything still unrecognized fails loudly
# here instead of surfacing later as NaN targets during training.
label_map = {"spam": 1, "ham": 0}
normalized_labels = df["label"].astype(str).str.strip().str.lower()
unknown_labels = sorted(set(normalized_labels) - set(label_map))
if unknown_labels:
    raise ValueError(
        f"Unexpected label values {unknown_labels!r}; expected 'spam' or 'ham'."
    )
df["label"] = normalized_labels.map(label_map).astype(int)

# Convert text column to string type. Missing cells become empty strings so
# they do not turn into the literal word "nan" in the vocabulary.
missing_text = df["text"].isna()
df["text"] = df["text"].astype(str)
df.loc[missing_text, "text"] = ""

# Step 2: Text Preprocessing
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
# Fetch the NLTK corpora used by the preprocessing step.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# Shared helpers used by preprocess_text().
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Return a cleaned, space-joined version of ``text``.

    The text is lowercased, every character that is not an ASCII letter or
    whitespace is deleted, the result is split on whitespace, words found in
    the module-level ``stop_words`` set are dropped, and each remaining word
    is passed through the module-level ``lemmatizer``.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    kept = []
    for token in letters_only.split():
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return ' '.join(kept)

df["cleaned_text"] = df["text"].apply(preprocess_text)

# Step 3: Tokenization and Padding
MAX_VOCAB_SIZE = 5000
MAX_SEQUENCE_LENGTH = 20

# Decide the train/test partition up front (row positions only) so the
# tokenizer vocabulary can be built from training rows alone. Fitting on the
# full dataset would leak test-set words into the features.
train_idx, test_idx = train_test_split(
    np.arange(len(df)), test_size=0.2, random_state=42
)

# Initialize Tokenizer and fit it on the training texts only
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(df["cleaned_text"].iloc[train_idx])

# Convert text to sequences (all rows, using the training vocabulary)
sequences = tokenizer.texts_to_sequences(df["cleaned_text"])
padded_sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Step 4: Split the Data
# Apply the partition chosen above to both features and labels.
X_train, X_test = padded_sequences[train_idx], padded_sequences[test_idx]
y_train, y_test = df["label"].iloc[train_idx], df["label"].iloc[test_idx]

# Step 5: Build the Enhanced LSTM Model
EMBEDDING_DIM = 50

# Architecture: embedding -> spatial dropout -> bidirectional LSTM ->
# dense ReLU layer with dropout -> single sigmoid unit for the binary label.
# `input_length` is deliberately not passed to Embedding: the argument is
# deprecated in Keras 3, and the sequence length is inferred from the data
# on the first call to fit/predict.
model = Sequential([
    Embedding(input_dim=MAX_VOCAB_SIZE, output_dim=EMBEDDING_DIM),
    SpatialDropout1D(0.3),
    Bidirectional(LSTM(64, dropout=0.3, recurrent_dropout=0.3)),
    Dense(64, activation="relu"),
    Dropout(0.4),
    Dense(1, activation="sigmoid"),
])

# Binary cross-entropy matches the single sigmoid output.
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Step 6: Train the Model
# The held-out split is passed as validation data, so per-epoch validation
# metrics are reported on the same rows used for the final report below.
model.fit(
    X_train,
    y_train,
    batch_size=16,
    epochs=15,
    verbose=1,
    validation_data=(X_test, y_test),
)

# Step 7: Evaluate the Model
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
test_probabilities = model.predict(X_test)
y_pred = (test_probabilities > 0.5).astype(int)
report = classification_report(y_test, y_pred, target_names=["Not Spam", "Spam"])
print(report)

# Step 8: Predict on User Input
# Interactive loop: read a message, run it through the same preprocessing,
# tokenization and padding as the training data, and print the predicted class.
while True:
    try:
        user_input = input("Enter the message to check whether it is spam or not (or type 'exit' to quit): ")
    except EOFError:
        # stdin was closed (e.g. non-interactive run); leave the loop cleanly.
        print("Exiting the spam classification system.")
        break
    # Strip so that " exit " or "exit\t" also quits.
    if user_input.strip().lower() == 'exit':
        print("Exiting the spam classification system.")
        break

    # Preprocess the input
    user_input_cleaned = preprocess_text(user_input)
    user_input_sequence = tokenizer.texts_to_sequences([user_input_cleaned])

    # If no word survives preprocessing or is known to the tokenizer, the
    # padded input would be all padding zeros and the prediction would be
    # meaningless, so ask for another message instead.
    if not user_input_sequence[0]:
        print("No recognizable words in the message; please enter a different message.")
        continue

    user_input_padded = pad_sequences(user_input_sequence, maxlen=MAX_SEQUENCE_LENGTH)

    # Predict
    prediction = model.predict(user_input_padded)
    label = (prediction > 0.5).astype(int)  # 1 = Spam, 0 = Not Spam

    print(f"Prediction: {'Spam' if label[0][0] == 1 else 'Not Spam'}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Epoch 1/15




[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 31ms/step - accuracy: 0.8986 - loss: 0.3229 - val_accuracy: 0.9874 - val_loss: 0.0562
Epoch 2/15
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 24ms/step - accuracy: 0.9835 - loss: 0.0711 - val_accuracy: 0.9865 - val_loss: 0.0509
Epoch 3/15
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 24ms/step - accuracy: 0.9909 - loss: 0.0322 - val_accuracy: 0.9857 - val_loss: 0.0559
Epoch 4/15
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 30ms/step - accuracy: 0.9971 - loss: 0.0163 - val_accuracy: 0.9830 - val_loss: 0.0581
Epoch 5/15
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 24ms/step - accuracy: 0.9960 - loss: 0.0164 - val_accuracy: 0.9794 - val_loss: 0.0749
Epoch 6/15
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 26ms/step - accuracy: 0.9975 - loss: 0.0100 - val_accuracy: 0.9883 - val_loss: 0.0611
Epoch 7/15
[1m279/279[0m [3