In [3]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

In [4]:

#LOAD DATASETS
# Both sources are reduced to the same two columns before they are merged:
# "message" (raw text) and "label" (0 = ham, 1 = spam).
LABEL_MAP = {"ham": 0, "spam": 1}


def _standardize_labels(frame):
    """Return ``frame`` reduced to message/label with integer 0/1 labels.

    Labels are stripped and lower-cased before they are mapped, so spellings
    such as "Spam" or "ham " are kept instead of being turned into NaN and
    dropped. Rows whose message is missing, or whose label is still not in
    LABEL_MAP after normalisation, are removed. The label column is cast to
    int64 so downstream code sees the same dtype regardless of whether any
    rows had to be dropped.
    """
    normalized = frame["label"].astype(str).str.strip().str.lower()
    labelled = frame.assign(label=normalized.map(LABEL_MAP))
    labelled = labelled[["message", "label"]].dropna()
    return labelled.astype({"label": "int64"})


spam_df = pd.read_csv("spam.csv", encoding="latin-1")

if "v1" in spam_df.columns:
    # Keep only the v1/v2 pair and give it the shared column names.
    spam_df = spam_df[["v1", "v2"]].rename(columns={"v1": "label", "v2": "message"})

indian_df = pd.read_excel("revisedindiandataset.xls")

if "code" in indian_df.columns:
    indian_df = indian_df.drop(columns=["code"])

if "msg" in indian_df.columns:
    indian_df = indian_df.rename(columns={"msg": "message"})

# Standardize labels
spam_df = _standardize_labels(spam_df)
indian_df = _standardize_labels(indian_df)

# Combine; when the same message text occurs more than once only the first
# occurrence is kept.
df = pd.concat([spam_df, indian_df], ignore_index=True)
df = df.drop_duplicates(subset="message")

print("Total samples:", df.shape[0])
print(df["label"].value_counts())

Total samples: 8994
label
0    7376
1    1618
Name: count, dtype: int64


In [5]:
#CLEAN TEXT
def clean_text(text):
    """Normalise one raw message before vectorisation.

    The value is coerced to ``str`` and lower-cased. Then, in this order,
    URLs (``http`` up to the next whitespace), runs of digits, and every
    character that is neither a word character nor whitespace are removed.

    Returns the cleaned string; whitespace is left untouched.
    """
    cleaned = str(text).lower()
    # Order matters: URLs are stripped first, while their punctuation and
    # digits are still present for the pattern to match as one token.
    for pattern in (r"http\S+", r"\d+", r"[^\w\s]"):
        cleaned = re.sub(pattern, "", cleaned)
    return cleaned

# Run every message through clean_text and store the result back in the
# same column, replacing the raw text.
cleaned_messages = df["message"].apply(clean_text)
df["message"] = cleaned_messages

In [6]:
#TRAIN TEST SPLIT

# Hold out 20% of the messages for evaluation. Stratifying on the label keeps
# the ham/spam proportions the same in both partitions, and the fixed
# random_state makes the partition repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=df["label"])

X_train, X_test, y_train, y_test = train_test_split(
    df["message"], df["label"], **split_options
)


In [7]:

#TF-IDF
# TF-IDF over unigrams and bigrams, capped at 8000 features.
tfidf_options = {"max_features": 8000, "ngram_range": (1, 2)}
vectorizer = TfidfVectorizer(**tfidf_options)

# The vectorizer is fitted on the training messages only; the test messages
# are transformed with that already-fitted vectorizer.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [8]:

#DEFINE MODELS

# Candidate classifiers, keyed by the display name used in the reports.
# random_state is fixed (same value as the train/test split) on the estimators
# that accept a seed for their internal randomness, so that a Restart & Run All
# reproduces the reported scores instead of drifting from run to run.
models = {
    "Logistic Regression": LogisticRegression(class_weight="balanced", max_iter=1000),
    "Linear SVM": LinearSVC(class_weight="balanced", random_state=42),
    "Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
}

In [9]:
#TRAIN & COMPARE

# One (name, accuracy, spam precision, spam recall) tuple per model, appended
# in the order the models are trained.
results = []

for name, model in models.items():
    print("\n==============================")
    print(f"Training: {name}")

    model.fit(X_train_vec, y_train)
    y_pred = model.predict(X_test_vec)

    acc = accuracy_score(y_test, y_pred)
    # Name the classes explicitly so the spam row is looked up under a fixed
    # key. Without target_names the key is the string form of the label value
    # ("1" for int labels, "1.0" for float labels), so the lookup would raise
    # KeyError as soon as the label column's dtype changes.
    report = classification_report(
        y_test,
        y_pred,
        labels=[0, 1],
        target_names=["ham", "spam"],
        output_dict=True,
    )

    spam_precision = report["spam"]["precision"]
    spam_recall = report["spam"]["recall"]

    print(f"Accuracy: {acc:.4f}")
    print(f"Spam Precision: {spam_precision:.4f}")
    print(f"Spam Recall: {spam_recall:.4f}")

    results.append((name, acc, spam_precision, spam_recall))


Training: Logistic Regression
Accuracy: 0.9611
Spam Precision: 0.8735
Spam Recall: 0.9167

Training: Linear SVM
Accuracy: 0.9666
Spam Precision: 0.9099
Spam Recall: 0.9043

Training: Naive Bayes
Accuracy: 0.9555
Spam Precision: 0.9841
Spam Recall: 0.7654

Training: Random Forest
Accuracy: 0.9566
Spam Precision: 0.9805
Spam Recall: 0.7747


In [10]:
#FINAL COMPARISON

# Print one summary line per trained model, in training order.
print("\n\n==== FINAL MODEL COMPARISON ====\n")
for model_name, accuracy, precision, recall in results:
    print(f"{model_name} -> Accuracy: {accuracy:.4f}, Spam Precision: {precision:.4f}, Spam Recall: {recall:.4f}")



==== FINAL MODEL COMPARISON ====

Logistic Regression -> Accuracy: 0.9611, Spam Precision: 0.8735, Spam Recall: 0.9167
Linear SVM -> Accuracy: 0.9666, Spam Precision: 0.9099, Spam Recall: 0.9043
Naive Bayes -> Accuracy: 0.9555, Spam Precision: 0.9841, Spam Recall: 0.7654
Random Forest -> Accuracy: 0.9566, Spam Precision: 0.9805, Spam Recall: 0.7747
