In [None]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB


In [None]:
# Spam Detection Project
# This notebook builds a Logistic Regression model to classify messages as
# spam or ham. It covers preprocessing, model training, evaluation, and error
# analysis, and also fits a Multinomial Naive Bayes model and reports its accuracy.

# Location of the dataset. Override with the SPAM_DATA_PATH environment
# variable; the default is the original author's local path.
DATA_PATH = Path(
    os.environ.get(
        "SPAM_DATA_PATH",
        "D:\\Python\\Spam Classifier Project\\Data\\spam.csv",
    )
)
# Numeric encoding of the class labels: ham=0, spam=1.
LABEL_TO_NUM = {"ham": 0, "spam": 1}

if not DATA_PATH.is_file():
    raise FileNotFoundError(
        f"Dataset not found at {DATA_PATH}; set the SPAM_DATA_PATH "
        "environment variable to the location of spam.csv."
    )

# Keep only the label and text columns and give them descriptive names.
df = (
    pd.read_csv(DATA_PATH, encoding="latin-1")[["v1", "v2"]]
    .rename(columns={"v1": "label", "v2": "message"})
)

# Only the last expression of a cell is rendered, so inspection output is
# printed explicitly instead of being left as bare expressions.
print(df.head())
print("Missing values per column:")
print(df.isna().sum())
print("Label counts:")
print(df["label"].value_counts())

# Encode labels. Series.map yields NaN for any value missing from the mapping,
# so fail loudly here rather than later inside the split or the model fit.
df["label_num"] = df["label"].map(LABEL_TO_NUM)
unmapped = df["label_num"].isna()
if unmapped.any():
    bad_labels = sorted(df.loc[unmapped, "label"].astype(str).unique())
    raise ValueError(f"Unrecognised label values in 'v1' column: {bad_labels}")
print(df.head())

# Features (raw message text) and numeric target.
X = df["message"]
y = df["label_num"]

# Hold out 20% of the messages for testing. The split is stratified on the
# target and uses a fixed seed so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Show the class proportions of both partitions side by side. As bare
# expressions in the middle of a cell these checks produced no output.
class_balance = pd.DataFrame({
    "train": y_train.value_counts(normalize=True),
    "test": y_test.value_counts(normalize=True),
})
print("Class proportions (0=ham, 1=spam):")
print(class_balance)
# TF-IDF settings: lowercase the text, drop English stop words, use unigrams
# and bigrams, and limit the vocabulary to 5000 features.
tfidf_params = {
    "lowercase": True,
    "stop_words": "english",
    "max_features": 5000,
    "ngram_range": (1, 2),
}
tfidf = TfidfVectorizer(**tfidf_params)

# The vectorizer is fitted on the training messages only; the test messages
# are transformed with that already-fitted vectorizer.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)
# Train the Logistic Regression classifier on the TF-IDF training features
# and predict labels for the held-out test features.
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)
y_pred = model.predict(X_test_tfidf)

# Test-set metrics.
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Confusion matrix of true vs. predicted labels. It is printed explicitly
# because a bare `cm` expression in the middle of a cell is never rendered.
cm = confusion_matrix(y_test, y_pred)
print("Confusion matrix (rows = actual, columns = predicted):")
print(cm)
# Collect each test message with its true label and the predicted label.
results = pd.DataFrame(
    {"message": X_test, "true_label": y_test, "pred_label": y_pred}
)

## False Negatives Analysis
# Observation recorded by the author: most false negatives are short or subtle
# spam messages, often promotional or containing codes.

# A false negative is a spam message (true label 1) predicted as ham (0).
missed_spam = results["true_label"].eq(1) & results["pred_label"].eq(0)
false_negatives = results[missed_spam]
# Comparison model: Multinomial Naive Bayes trained on the same TF-IDF features.
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)
y_pred_nb = nb_model.predict(X_test_tfidf)

nb_accuracy = accuracy_score(y_test, y_pred_nb)
print("NB Accuracy:", nb_accuracy)
# Show the first ten false negatives with the full message text. A bare
# `false_negatives.head(10)` followed by more statements renders nothing, and
# the default column width would truncate the messages.
print("False negatives (spam predicted as ham):")
with pd.option_context("display.max_colwidth", None):
    print(false_negatives.head(10))

# Heatmap of the Logistic Regression confusion matrix. Tick labels follow the
# label encoding used in this notebook (0=ham, 1=spam).
class_names = ["ham", "spam"]
ax = sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
ax.set(
    xlabel="Predicted",
    ylabel="Actual",
    title="Logistic Regression confusion matrix",
)
plt.show()


