In [3]:
# ==========================
# 📘 Spam Email Detection Model
# ==========================

# Step 1: Import Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Step 2: Load Dataset
# The SMS spam data (sms.tsv) is read as a tab-separated file with no header
# row, so the two columns are named explicitly: the class label and the raw
# message text.
url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
df = pd.read_csv(
    url,
    sep='\t',
    header=None,
    names=['label', 'message'],
)

print("✅ Dataset Loaded Successfully!")
# Show the first rows as a quick sanity check of the parsed columns.
print(df.head())

# Step 3: Encode Labels
# Map the text labels to integers: 0 = ham, 1 = spam.
df['label_num'] = df['label'].map({'ham': 0, 'spam': 1})

# Series.map turns every label that is missing from the dict into NaN. Fail
# here with a clear message instead of letting NaN targets flow into the
# train/test split and the classifier.
if df['label_num'].isna().any():
    unexpected = sorted(df.loc[df['label_num'].isna(), 'label'].astype(str).unique())
    raise ValueError(f"Unexpected label value(s) in dataset: {unexpected}")

# Step 4: Split into Training and Testing Data
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['message'],
    df['label_num'],
    test_size=0.2,
    random_state=42,
)

# Step 5: Convert Text to Numerical Data
# Token counts with English stop words removed. The vectorizer is fitted on
# the training messages only; the test messages are encoded with that same
# fitted vectorizer.
cv = CountVectorizer(stop_words='english')
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

# Step 6: Train the Model
# fit() returns the estimator itself, so construction and training are chained.
model = MultinomialNB().fit(X_train_cv, y_train)

# Step 7: Predict
# Predicted class (0 or 1) for every message in the test split.
y_pred = model.predict(X_test_cv)

# Step 8: Evaluate Model
# Each metric compares the test-split labels with the model's predictions;
# it is computed first and then printed under its own heading.
print("\n📊 Model Evaluation:")

test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", conf_matrix)

report_text = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report_text)

# Step 9: Try with New Messages
# A few hand-written messages to classify with the trained model.
sample_emails = [
    "Congratulations! You have won a $1000 Walmart gift card. Click here to claim now!",
    "Hey, are we still meeting for lunch tomorrow?",
    "URGENT! Your account has been compromised, verify immediately."
]

# Encode with the already-fitted vectorizer (transform, not fit_transform) so
# the samples use the same features the model was trained on.
sample_transformed = cv.transform(sample_emails)
sample_pred = model.predict(sample_transformed)

print("\n📧 Sample Predictions:")
for email, pred in zip(sample_emails, sample_pred):
    # pred is the numeric class (1 = spam, 0 = ham). Both labels are printed
    # capitalised so they match the 'Ham' / 'Spam' legend in the summary.
    print(f"Message: {email}\nPrediction: {'Spam' if pred else 'Ham'}\n")
# Step 10: Print English summary of results
# Accuracy on the test split, converted to a percentage once and reused in
# both sentences below.
acc = accuracy_score(y_test, y_pred)
acc_percent = acc * 100

print("\n📘 Model Summary (in plain English):")
print(f"The spam detection model achieved an accuracy of {acc_percent:.2f}%.")
print(
    "This means the model correctly identifies whether an email is spam or not in about "
    f"{acc_percent:.1f}% of the cases.\n"
)

# Legend for the two classes, followed by the heading for the sample list.
for line in (
    "✅ 'Ham' means a normal/non-spam message.",
    "🚫 'Spam' means a suspicious or promotional message.\n",
    "Below are the predictions for a few sample emails:\n",
):
    print(line)

# Restate each sample prediction as a sentence; a truthy pred (1) means spam.
for email, pred in zip(sample_emails, sample_pred):
    if pred:
        label = "SPAM 🚫"
    else:
        label = "NOT SPAM ✅"
    print(f"Email: {email}\n→ Prediction: This email is {label}\n")

✅ Dataset Loaded Successfully!
  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...

📊 Model Evaluation:
Accuracy: 0.9883408071748879

Confusion Matrix:
 [[961   5]
 [  8 141]]

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       966
           1       0.97      0.95      0.96       149

    accuracy                           0.99      1115
   macro avg       0.98      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115


📧 Sample Predictions:
Message: Congratulations! You have won a $1000 Walmart gift card. Click here to claim now!
Prediction: spam

Message: Hey, are we still meeting for