In [51]:
%pip install tensorflow

import pandas as pd
import tensorflow as tf 
from tensorflow.keras.models import Sequential

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [52]:
df = pd.read_csv('spam.csv', encoding='latin1')

In [53]:
print(df.columns.tolist())

['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']


In [54]:
df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace=True)

In [55]:
df.rename(columns={'v1': 'label', 'v2': 'text'}, inplace=True)

In [56]:
df['text'] = (
    df['text']
    .str.lower()  # lowercase
    .str.replace(r'\S+@\S+', '', regex=True)  # remove emails
    .str.replace(r'http\S+|www\S+|https\S+', '', regex=True)  # remove URLs
    .str.replace(r'[$€£¥₹]', '', regex=True)  # currency symbols
    .str.replace(r'\d+', '', regex=True)  # digits
    .str.replace(r'[^\w\s]', '', regex=True)  # punctuation
    .str.replace(r'\s+', ' ', regex=True)  # extra spaces
    .str.strip()
)

In [57]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [58]:
le = LabelEncoder()
df['label'] = le.fit_transform(df['label'])

In [59]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X = vectorizer.fit_transform(df['text']).toarray()
y = df['label'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [60]:
from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight('balanced', 
                                   classes=np.unique(y_train), 
                                   y=y_train)
class_weight_dict = {i: class_weights[i] for i in range(len(class_weights))}

print("Class distribution in training data:")
print(f"Ham (0): {sum(y_train == 0)}")
print(f"Spam (1): {sum(y_train == 1)}")
print(f"Class weights: {class_weight_dict}")


model = Sequential([
    tf.keras.layers.Dense(512, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy', 'precision', 'recall']
)

print("\nTraining improved model...")
history = model.fit(
    X_train, y_train,
    epochs=30,
    batch_size=64,
    validation_split=0.2,
    class_weight=class_weight_dict,
    verbose=1
)

Class distribution in training data:
Ham (0): 3860
Spam (1): 597
Class weights: {0: 0.5773316062176166, 1: 3.7328308207705194}

Training improved model...
Epoch 1/30


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 42ms/step - accuracy: 0.5747 - loss: 0.6977 - precision: 0.1663 - recall: 0.5612 - val_accuracy: 0.8621 - val_loss: 0.5840 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 2/30
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 38ms/step - accuracy: 0.9033 - loss: 0.2336 - precision: 0.5892 - recall: 0.9327 - val_accuracy: 0.8621 - val_loss: 0.3792 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 3/30
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 37ms/step - accuracy: 0.9589 - loss: 0.0891 - precision: 0.7706 - recall: 0.9766 - val_accuracy: 0.8621 - val_loss: 0.3869 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 4/30
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 35ms/step - accuracy: 0.9830 - loss: 0.0469 - precision: 0.8971 - recall: 0.9926 - val_accuracy: 0.8621 - val_loss: 0.5445 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 5

In [61]:
from sklearn.metrics import accuracy_score, classification_report
# Evaluate the improved model
predictions_prob = model.predict(X_test)
predictions = (predictions_prob > 0.5).astype(int)

# Calculate accuracy
accuracy = accuracy_score(y_test, predictions)
print(f'\nModel Accuracy: {accuracy:.4f}')
print('\nModel Classification Report:')
print(classification_report(y_test, predictions, target_names=le.classes_))

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step

Model Accuracy: 0.9731

Model Classification Report:
              precision    recall  f1-score   support

         ham       0.98      0.99      0.98       965
        spam       0.91      0.89      0.90       150

    accuracy                           0.97      1115
   macro avg       0.94      0.94      0.94      1115
weighted avg       0.97      0.97      0.97      1115



In [62]:
import re
import numpy as np

def predict_spam(text):
    # 1. Clean the text
    cleaned_text = text.lower()
    cleaned_text = re.sub(r'\S+@\S+', '', cleaned_text)                     # remove emails
    cleaned_text = re.sub(r'http\S+|www\S+|https\S+', '', cleaned_text)    # remove URLs
    cleaned_text = re.sub(r'[$€£¥₹]', '', cleaned_text)                    # remove currency symbols
    cleaned_text = re.sub(r'\d+', '', cleaned_text)                        # remove digits
    cleaned_text = re.sub(r'[^a-zA-Z\s]', '', cleaned_text)                # remove special characters
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()               # remove extra spaces

    # 2. Vectorize using the same fitted vectorizer
    text_vector = vectorizer.transform([cleaned_text]).toarray()

    # 3. Predict
    prediction = model.predict(text_vector)

    # 4. Interpret the prediction
    if prediction[0][0] > 0.5:
        return "Spam"
    else:
        return "Not Spam"

In [63]:
text = "Congratulations! You've won a $1000 cash prize. Click here to claim your reward."
result = predict_spam(text)
print(f'Text: "{text}"\nPrediction: {result}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
Text: "Congratulations! You've won a $1000 cash prize. Click here to claim your reward."
Prediction: Spam


In [65]:
model.save('spam_classifier_model.keras')


In [66]:
import pickle

# Save the vectorizer
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)
