In [8]:
import pandas as pd
import numpy as np

df = pd.read_csv("spam.csv", encoding="latin-1")




In [9]:
df = df[["v1", "v2"]]
df = df.rename(columns={"v1": "label", "v2": "text"})

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# ----------------------------
# spam -> 1, ham -> 0
df["label"] = df["label"].map({"ham": 0, "spam": 1})

# ----------------------------
# 2. Train-test split
# ----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    df["label"],
    test_size=0.2,
    random_state=42,
    stratify=df["label"]
)

# ----------------------------
# 3. Models to compare
# ----------------------------
models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Linear SVM": LinearSVC(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

# ----------------------------
# 4. Train & Evaluate
# ----------------------------
results = []

for name, model in models.items():
    pipeline = Pipeline([
        ("tfidf", TfidfVectorizer(stop_words="english")),
        ("model", model)
    ])

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred)
    })

# ----------------------------
# 5. Display results
# ----------------------------
results_df = pd.DataFrame(results).sort_values(by="F1 Score", ascending=False)

print("\nModel Comparison Results:\n")
print(results_df)




Model Comparison Results:

                 Model  Accuracy  Precision    Recall  F1 Score
2           Linear SVM  0.983857   0.992481  0.885906  0.936170
3        Random Forest  0.974888   1.000000  0.812081  0.896296
0          Naive Bayes  0.968610   1.000000  0.765101  0.866920
1  Logistic Regression  0.967713   1.000000  0.758389  0.862595
4    Gradient Boosting  0.961435   0.990741  0.718121  0.832685


In [None]:
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    Dense, Embedding, LSTM, Bidirectional,
    Conv1D, GlobalMaxPooling1D, Dropout
)

# ----------------------------
# Tokenization & Padding
# ----------------------------
VOCAB_SIZE = 10000
MAX_LEN = 100

tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding="post")
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding="post")

# ----------------------------
# 4. Model builders
# ----------------------------
def dense_model():
    model = Sequential([
        Embedding(VOCAB_SIZE, 128, input_length=MAX_LEN),
        GlobalMaxPooling1D(),
        Dense(64, activation="relu"),
        Dropout(0.5),
        Dense(1, activation="sigmoid")
    ])
    return model

def cnn_model():
    model = Sequential([
        Embedding(VOCAB_SIZE, 128, input_length=MAX_LEN),
        Conv1D(128, 5, activation="relu"),
        GlobalMaxPooling1D(),
        Dense(64, activation="relu"),
        Dense(1, activation="sigmoid")
    ])
    return model

def lstm_model():
    model = Sequential([
        Embedding(VOCAB_SIZE, 128, input_length=MAX_LEN),
        LSTM(128),
        Dense(1, activation="sigmoid")
    ])
    return model

def bi_lstm_model():
    model = Sequential([
        Embedding(VOCAB_SIZE, 128, input_length=MAX_LEN),
        Bidirectional(LSTM(128)),
        Dense(1, activation="sigmoid")
    ])
    return model

# ----------------------------
# 5. Train & Evaluate
# ----------------------------
models = {
    "Dense NN": dense_model(),
    "CNN": cnn_model(),
    "LSTM": lstm_model(),
    "Bi-LSTM": bi_lstm_model()
}

results = []

for name, model in models.items():
    print(f"\nTraining {name}...")

    model.compile(
        loss="binary_crossentropy",
        optimizer="adam",
        metrics=["accuracy"]
    )

    model.fit(
        X_train_pad, y_train,
        epochs=5,
        batch_size=32,
        validation_split=0.1,
        verbose=0
    )

    y_pred = (model.predict(X_test_pad) > 0.5).astype(int)

    results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred)
    })

# ----------------------------
# 6. Results
# ----------------------------
results_df = pd.DataFrame(results).sort_values(by="F1 Score", ascending=False)
print("\nDeep Learning Model Comparison:\n")
print(results_df)





Training Dense NN...
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step

Training CNN...
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step

Training LSTM...
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 71ms/step

Training Bi-LSTM...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 101ms/step

Deep Learning Model Comparison:

      Model  Accuracy  Precision    Recall  F1 Score
1       CNN  0.988341   0.972222  0.939597  0.955631
0  Dense NN  0.988341   1.000000  0.912752  0.954386
3   Bi-LSTM  0.988341   1.000000  0.912752  0.954386
2      LSTM  0.866368   0.000000  0.000000  0.000000
