In [None]:
# Project Title: Automated Spam Email Detection

In [5]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("uciml/sms-spam-collection-dataset")

print("Path to dataset files:", path)


Downloading from https://www.kaggle.com/api/v1/datasets/download/uciml/sms-spam-collection-dataset?dataset_version_number=1...


100%|██████████| 211k/211k [00:00<00:00, 254kB/s]

Extracting files...
Path to dataset files: C:\Users\Saikat\.cache\kagglehub\datasets\uciml\sms-spam-collection-dataset\versions\1





In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import string
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Download stopwords from NLTK
nltk.download('stopwords')
from nltk.corpus import stopwords

# 📁 Load dataset
df = pd.read_csv(
    r'C:\\Users\\Saikat\\.cache\\kagglehub\\datasets\\uciml\\sms-spam-collection-dataset\\versions\\1\\spam.csv',
    encoding='latin-1'
)[['v1', 'v2']]
df.columns = ['label', 'text']  # Rename columns

# 🔍 Text cleaning function
def clean_text(text):
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # remove URLs
    text = re.sub(r'<.*?>', '', text)  # remove HTML
    text = re.sub(f"[{re.escape(string.punctuation)}]", '', text)  # remove punctuation
    words = text.split()
    words = [w for w in words if w not in stopwords.words('english')]
    return ' '.join(words)

# 🧹 Clean the text data
df['clean_text'] = df['text'].apply(clean_text)

# 🎯 Encode labels: ham=0, spam=1
df['label_num'] = df['label'].map({'ham': 0, 'spam': 1})

# 📊 Feature extraction
X = df['clean_text']
y = df['label_num']
cv = CountVectorizer()
X_vectorized = cv.fit_transform(X)

# 🧪 Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X_vectorized, y, test_size=0.2, random_state=42
)

# 🏗️ Train the model
model = MultinomialNB()
model.fit(X_train, y_train)

# ✅ Evaluate the model
y_pred = model.predict(X_test)
print("📈 Accuracy:", accuracy_score(y_test, y_pred))
print("\n📊 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\n📝 Classification Report:\n", classification_report(y_test, y_pred))

# 🔍 Test on a sample email
def predict_email(text):
    text = clean_text(text)
    vector = cv.transform([text])
    prediction = model.predict(vector)[0]
    return "Spam" if prediction else "Not Spam"

# 🧪 Example
sample_email = "Congratulations! You've won a $1000 Walmart gift card. Click here to claim now."
result = predict_email(sample_email)
print(f"\n📤 Sample prediction:\n'{sample_email}' --> {result}")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Saikat\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


📈 Accuracy: 0.9739910313901345

📊 Confusion Matrix:
 [[949  16]
 [ 13 137]]

📝 Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.98       965
           1       0.90      0.91      0.90       150

    accuracy                           0.97      1115
   macro avg       0.94      0.95      0.94      1115
weighted avg       0.97      0.97      0.97      1115


📤 Sample prediction:
'Congratulations! You've won a $1000 Walmart gift card. Click here to claim now.' --> Spam
