In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS


In [21]:
# Load the dataset
data = pd.read_csv('spam.csv', encoding='latin-1')

# Convert labels to binary (ham = 0, spam = 1)
data['class'] = data['class'].map({'ham': 0, 'spam': 1})

# Preprocess text
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation and non-alphanumeric characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize by splitting and remove stop words
    tokens = [word for word in text.split() if word not in ENGLISH_STOP_WORDS]
    return ' '.join(tokens)

data['message'] = data['message'].apply(preprocess_text)
data

Unnamed: 0,class,message
0,0,jurong point crazy available bugis n great wor...
1,0,ok lar joking wif u oni
2,1,free entry wkly comp win fa cup final tkts st ...
3,0,u dun say early hor u c say
4,0,nah dont think goes usf lives
...,...,...
5567,1,nd time tried contact u u won pound prize clai...
5568,0,b going esplanade fr home
5569,0,pity mood soany suggestions
5570,0,guy did bitching acted like id interested buyi...


In [5]:
X = data['message']
y = data['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [7]:
# Parameters
max_words = 5000  # Max vocabulary size
max_len = 100     # Max length for padding

# Tokenize the text
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Convert text to sequences and pad them
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post')


In [9]:
# Define the RNN model
model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    SpatialDropout1D(0.2),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])




In [11]:
# Early stopping to avoid overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model
history = model.fit(X_train_pad, y_train, validation_split=0.2, epochs=10, batch_size=32, callbacks=[early_stopping])


Epoch 1/10
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 103ms/step - accuracy: 0.8464 - loss: 0.4695 - val_accuracy: 0.8621 - val_loss: 0.4017
Epoch 2/10
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 88ms/step - accuracy: 0.8754 - loss: 0.4070 - val_accuracy: 0.8621 - val_loss: 0.4017
Epoch 3/10
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 90ms/step - accuracy: 0.8605 - loss: 0.4245 - val_accuracy: 0.8621 - val_loss: 0.4045
Epoch 4/10
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 90ms/step - accuracy: 0.8754 - loss: 0.3961 - val_accuracy: 0.8621 - val_loss: 0.4014
Epoch 5/10
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 89ms/step - accuracy: 0.8643 - loss: 0.4116 - val_accuracy: 0.8621 - val_loss: 0.4012
Epoch 6/10
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 90ms/step - accuracy: 0.8702 - loss: 0.4024 - val_accuracy: 0.8621 - val_loss: 0.4013
Epoch 7/10
[1m

In [33]:
# Evaluate the model on test data
y_pred = (model.predict(X_test_pad) > 0.5).astype("int32")

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
data.head()

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
Accuracy: 0.8654708520179372
Precision: 0.0
Recall: 0.0
F1 Score: 0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,class,message
0,0,jurong point crazy available bugis n great wor...
1,0,ok lar joking wif u oni
2,1,free entry wkly comp win fa cup final tkts st ...
3,0,u dun say early hor u c say
4,0,nah dont think goes usf lives


In [15]:
def classify_message(model, tokenizer, message):
    # Preprocess the input message (same steps as training data)
    message = message.lower()
    message = re.sub(r'[^a-zA-Z\s]', '', message)
    tokens = [word for word in message.split() if word not in ENGLISH_STOP_WORDS]
    processed_message = ' '.join(tokens)
    
    # Tokenize and pad the message
    message_seq = tokenizer.texts_to_sequences([processed_message])
    message_pad = pad_sequences(message_seq, maxlen=max_len, padding='post')
    
    # Predict using the trained model
    prediction = model.predict(message_pad)
    return "spam" if prediction > 0.5 else "ham"


In [29]:
# Example messages
new_messages = ["Congratulations! You've won a $1000 Walmart gift card. Click here to claim.",
    "Are we still on for lunch tomorrow?",
    "Urgent: Your account is compromised. Please update your password immediately."]

# Classify each message
for msg in new_messages:
    result = classify_message(model, tokenizer, msg)
    print(f"Message: {msg}\nClassified as: {result}\n")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step
Message: Congratulations! You've won a $1000 Walmart gift card. Click here to claim.
Classified as: ham

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
Message: Are we still on for lunch tomorrow?
Classified as: ham

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
Message: Urgent: Your account is compromised. Please update your password immediately.
Classified as: ham

