In [None]:
from IPython.display import HTML

# Public URL of the deployed demo site (single trailing slash).
url = "https://spamdetectionbyluvgoel.netlify.app/"

# Render an animated gradient banner with a button that opens the demo site.
# Doubled braces ({{ }}) are literal CSS braces escaped for the f-string.
# rel="noopener noreferrer" keeps the page opened through target="_blank"
# from reaching back into this window via window.opener.
# The HTML object must stay the last expression so the notebook displays it.
HTML(f"""
<style>@keyframes g{{0%,100%{{background-position:0 50%}}50%{{background-position:100% 50%}}}}</style>
<div style="text-align:center;padding:20px;background:linear-gradient(135deg,#667eea,#764ba2,#667eea);background-size:200%;animation:g 4s ease infinite;border-radius:10px">
<h2 style="color:#fff;margin:0 0 5px">Spam Detector by Luv Goel</h2><br>
<a href="{url}" target="_blank" rel="noopener noreferrer" style="display:inline-block;padding:15px 40px;background:#fff;color:#667eea;font-weight:bold;font-size:16px;border-radius:10px;box-shadow:0 5px 15px rgba(0,0,0,.3);text-decoration:none;transition:transform .3s" onmouseover="this.style.transform='scale(1.05)'" onmouseout="this.style.transform='scale(1)'">Click Here to Open Website</a>
</div>
""")

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Sample dataset: hand-written spam and legitimate ("ham") emails.
# Each class lives in its own list so that an email's label follows from the
# list it belongs to, instead of from a separately maintained label array
# that could silently drift out of step when emails are added or removed.
spam_emails = [
    "WIN FREE MONEY NOW!!! Click here to claim your prize",
    "Congratulations! You've won $1000000. Send your bank details",
    "URGENT: Your account will be closed. Verify now!",
    "Get rich quick! Make money fast from home",
    "Free viagra pills! Order now! Limited offer!!!",
    "You have inherited millions! Contact us immediately",
    "CLICK HERE for amazing discounts! Buy now!!!",
    "Lose weight fast! Miracle pills available",
    "Your loan has been approved! No credit check needed",
    "Act now! Limited time offer. Free gift inside",
    "Dear friend, I am a prince and need your help transferring money",
    "Congratulations winner! Claim your free iPhone now",
    "Work from home and earn thousands per week",
    "URGENT RESPONSE REQUIRED! Your package is waiting",
    "FREE CREDIT CARD! Apply now with no fees",
    "Make millions online! No experience needed!!!",
    "You are a lucky winner! Claim your prize money",
    "Buy cheap medications online. No prescription needed",
    "Earn extra cash working from home today",
    "Your credit score is terrible! Fix it now",
]

ham_emails = [
    "Hi, can we schedule a meeting tomorrow at 2pm?",
    "Thanks for your email. I'll review the document and get back to you",
    "The project deadline has been moved to next Friday",
    "Please find attached the quarterly report for your review",
    "Would you like to grab lunch next week?",
    "The team meeting is scheduled for Monday morning",
    "I've completed the analysis. Let me know if you need anything else",
    "Your order has been shipped and will arrive in 3-5 business days",
    "Thank you for your purchase. Here is your receipt",
    "Reminder: Your subscription renews on March 15th",
    "Hi Mom, hope you're doing well. Talk soon!",
    "The conference call will start in 10 minutes",
    "I appreciate your help with this project",
    "Your appointment is confirmed for Thursday at 3pm",
    "Let me know if you have any questions about the proposal",
    "I reviewed the presentation and it looks great",
    "Can you send me the latest version of the report?",
    "The budget approval is pending with management",
    "Thank you for attending today's workshop",
    "Please update your contact information in the system",
]

# Full corpus: all spam first, then all ham.
emails = spam_emails + ham_emails

# Labels: 1 = Spam, 0 = Legitimate (Ham), aligned index-for-index with `emails`.
labels = [1] * len(spam_emails) + [0] * len(ham_emails)

print("SPAM EMAIL DETECTION SYSTEM")
print(f"\nTotal emails in dataset: {len(emails)}")
print(f"Spam emails: {sum(labels)}")
print(f"Legitimate emails: {len(labels) - sum(labels)}")
print()

SPAM EMAIL DETECTION SYSTEM

Total emails in dataset: 40
Spam emails: 20
Legitimate emails: 20



In [None]:
# Convert text to numerical features using CountVectorizer
vectorizer = CountVectorizer(lowercase=True, stop_words='english')
X = vectorizer.fit_transform(emails)

print("Text converted to numerical features")
print(f"Feature vector shape: {X.shape}")
print(f"Vocabulary size: {len(vectorizer.vocabulary_)}")
print()

Text converted to numerical features
Feature vector shape: (40, 149)
Vocabulary size: 149



In [None]:
# Split data into training and testing sets (80% train, 20% test).
# random_state fixes the shuffle so the split is reproducible. stratify keeps
# the spam/ham ratio identical in both parts; with a test set this small an
# unstratified draw easily ends up lopsided, which skews per-class metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X, labels, test_size=0.2, random_state=42, stratify=labels
)

print(f"Training set size: {X_train.shape[0]} emails")
print(f"Testing set size: {X_test.shape[0]} emails")
print()

Training set size: 32 emails
Testing set size: 8 emails



NAIVE BAYES CLASSIFIER

In [None]:
# Fit a multinomial Naive Bayes classifier on the training word counts.
# fit() returns the estimator itself, so construction and training are chained;
# the bare name on the last line keeps the fitted model as the cell's output.
nb_model = MultinomialNB().fit(X_train, y_train)
nb_model

In [None]:
# Predict a label for every email in the held-out test set with the trained
# Naive Bayes model (1 = spam, 0 = legitimate).
nb_predictions = nb_model.predict(X_test)

In [None]:
# Evaluate Naive Bayes on the held-out emails: overall accuracy, per-class
# precision/recall/F1, and the confusion matrix.
# labels=[0, 1] pins the class order to match target_names. Without it,
# classification_report raises ValueError when a class is absent from the
# test data, and the confusion matrix would no longer be 2x2.
nb_accuracy = accuracy_score(y_test, nb_predictions)

print(f"\nAccuracy: {nb_accuracy * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, nb_predictions, labels=[0, 1], target_names=['Legitimate', 'Spam']))

# Rows are true classes, columns are predicted classes, both ordered [0, 1].
print("Confusion Matrix:")
nb_cm = confusion_matrix(y_test, nb_predictions, labels=[0, 1])
print(nb_cm)
print()


Accuracy: 100.00%

Classification Report:
              precision    recall  f1-score   support

  Legitimate       1.00      1.00      1.00         3
        Spam       1.00      1.00      1.00         5

    accuracy                           1.00         8
   macro avg       1.00      1.00      1.00         8
weighted avg       1.00      1.00      1.00         8

Confusion Matrix:
[[3 0]
 [0 5]]



SVM CLASSIFIER

In [None]:
# Fit a linear-kernel support vector classifier on the training word counts.
# fit() returns the estimator itself, so construction and training are chained;
# the bare name on the last line keeps the fitted model as the cell's output.
svm_model = SVC(kernel="linear", random_state=42).fit(X_train, y_train)
svm_model

In [None]:
# Predict a label for every email in the held-out test set with the trained
# SVM model (1 = spam, 0 = legitimate).
svm_predictions = svm_model.predict(X_test)

In [None]:
# Evaluate the SVM on the held-out emails: overall accuracy, per-class
# precision/recall/F1, and the confusion matrix.
# labels=[0, 1] pins the class order to match target_names. Without it,
# classification_report raises ValueError when a class is absent from the
# test data, and the confusion matrix would no longer be 2x2.
svm_accuracy = accuracy_score(y_test, svm_predictions)

print(f"\nAccuracy: {svm_accuracy * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, svm_predictions, labels=[0, 1], target_names=['Legitimate', 'Spam']))

# Rows are true classes, columns are predicted classes, both ordered [0, 1].
print("Confusion Matrix:")
svm_cm = confusion_matrix(y_test, svm_predictions, labels=[0, 1])
print(svm_cm)
print()


Accuracy: 100.00%

Classification Report:
              precision    recall  f1-score   support

  Legitimate       1.00      1.00      1.00         3
        Spam       1.00      1.00      1.00         5

    accuracy                           1.00         8
   macro avg       1.00      1.00      1.00         8
weighted avg       1.00      1.00      1.00         8

Confusion Matrix:
[[3 0]
 [0 5]]



In [None]:
print("TESTING WITH NEW EMAILS")

# Unseen emails used as a quick sanity check of both trained models
test_emails = [
    "Congratulations! You won a free vacation. Click now!",
    "Hi John, let's meet for coffee tomorrow afternoon",
    "URGENT! Your account needs verification immediately!!!",
    "The meeting has been rescheduled to next Tuesday"
]

print("\nTesting new emails:")
print("-" * 60)

# Encode and score the whole batch once instead of once per email.
# transform() reuses the vocabulary learned earlier, so words the vectorizer
# never saw while fitting are simply ignored.
email_vectors = vectorizer.transform(test_emails)
nb_preds = nb_model.predict(email_vectors)
svm_preds = svm_model.predict(email_vectors)
# One row of class probabilities per email (reported for Naive Bayes only).
nb_probs = nb_model.predict_proba(email_vectors)

for i, (email, nb_pred, svm_pred, nb_prob) in enumerate(
    zip(test_emails, nb_preds, svm_preds, nb_probs), 1
):
    print(f"\nEmail {i}: {email}")
    # The largest class probability belongs to the predicted class.
    print(f"  Naive Bayes: {'SPAM' if nb_pred == 1 else 'LEGITIMATE'} (confidence: {max(nb_prob)*100:.1f}%)")
    print(f"  SVM: {'SPAM' if svm_pred == 1 else 'LEGITIMATE'}")

TESTING WITH NEW EMAILS

Testing new emails:
------------------------------------------------------------

Email 1: Congratulations! You won a free vacation. Click now!
  Naive Bayes: SPAM (confidence: 98.5%)
  SVM: SPAM

Email 2: Hi John, let's meet for coffee tomorrow afternoon
  Naive Bayes: LEGITIMATE (confidence: 94.1%)
  SVM: LEGITIMATE

Email 3: URGENT! Your account needs verification immediately!!!
  Naive Bayes: SPAM (confidence: 90.0%)
  SVM: SPAM

Email 4: The meeting has been rescheduled to next Tuesday
  Naive Bayes: LEGITIMATE (confidence: 78.2%)
  SVM: LEGITIMATE


In [None]:
# Side-by-side summary of the two classifiers' test-set accuracy.
print("MODEL COMPARISON")
print(f"\nNaive Bayes Accuracy: {nb_accuracy * 100:.2f}%")
print(f"SVM Accuracy: {svm_accuracy * 100:.2f}%")

# Pick the verdict line; the comparisons run in the same order as an
# if / elif / else chain, with a tie as the fall-through case.
verdict = (
    "\n✓ Naive Bayes performed better on this dataset"
    if nb_accuracy > svm_accuracy
    else "\n✓ SVM performed better on this dataset"
    if svm_accuracy > nb_accuracy
    else "\n✓ Both models performed equally well"
)
print(verdict)

print("ANALYSIS COMPLETE")


MODEL COMPARISON

Naive Bayes Accuracy: 100.00%
SVM Accuracy: 100.00%

✓ Both models performed equally well
ANALYSIS COMPLETE
