## Email Spam Classifier

### Libraries Needed

In [9]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

### Data Preparation

In [10]:
# Labelled toy corpus, kept as one list per class so every example sits
# next to the others that share its label.
spam_examples = [
    "Get rich quick! Buy now!",
    "Win a free iPhone today!",
    "Free money, act now!",
    "Congratulations! You've won!",
    "Click here to claim your prize",
    "Earn $$$ fast with this trick",
    "This is not a scam, click now",
]
ham_examples = [
    "Meeting at 3pm tomorrow",
    "Project deadline reminder",
    "Team lunch next week",
    "Team meeting scheduled for Friday",
    "Don't forget our call at noon",
    "Please review the attached file",
    "Weekly project status update",
]

# Spam first, then legitimate mail; the labels follow the same order
# (1 = spam, 0 = not spam), so each label lines up with its email.
emails = spam_examples + ham_examples
labels = [1] * len(spam_examples) + [0] * len(ham_examples)

# Hold out 20% of the examples for testing; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    emails,
    labels,
    test_size=0.2,
    random_state=42,
)

### Text Processing Pipeline

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

def create_spam_classifier(alpha=0.5):
    """Build an unfitted TF-IDF + multinomial naive Bayes text pipeline.

    Parameters
    ----------
    alpha : float, default=0.5
        Additive smoothing parameter forwarded to ``MultinomialNB``.
        Lower values mean less smoothing. The default reproduces the
        value that used to be hard-coded here.

    Returns
    -------
    sklearn.pipeline.Pipeline
        A two-step pipeline: a ``'vectorizer'`` step (``TfidfVectorizer``)
        followed by a ``'classifier'`` step (``MultinomialNB``). The
        pipeline is returned unfitted; call ``.fit(texts, labels)`` on it.
    """
    vectorizer = TfidfVectorizer(
        lowercase=True,
        stop_words='english',
        ngram_range=(1, 2),     # unigrams + bigrams
        min_df=1,               # keep rare terms
        max_df=0.9,             # ignore terms found in more than 90% of documents
        sublinear_tf=True       # logarithmic scaling of term frequency
    )
    classifier = MultinomialNB(alpha=alpha)

    return Pipeline([
        ('vectorizer', vectorizer),
        ('classifier', classifier),
    ])


### Train and Evaluate the Model

In [12]:
# Fit a fresh pipeline on the training split
model = create_spam_classifier()
model.fit(X_train, y_train)

# Score the held-out emails: hard labels plus per-class probabilities
predictions = model.predict(X_test)
probabilities = model.predict_proba(X_test)

# Compute the metrics first, then print them in the same order as before
test_accuracy = accuracy_score(y_test, predictions)
test_report = classification_report(y_test, predictions)

print("Accuracy:", test_accuracy)
print("\nClassification Report:")
print(test_report)

Accuracy: 0.6666666666666666

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.50      0.67         2
           1       0.50      1.00      0.67         1

    accuracy                           0.67         3
   macro avg       0.75      0.75      0.67         3
weighted avg       0.83      0.67      0.67         3



In [13]:
# Score a couple of hand-written emails with the fitted pipeline.
# NOTE: the second email also appears verbatim in the labelled `emails`
# dataset, so it is not truly unseen; depending on how the split fell it
# may have been part of the training data.
new_emails = [
    "Congratulations! You've won a prize!",
    "Team meeting scheduled for Friday"
]

# Use dedicated names here: reusing `predictions` / `probabilities` would
# overwrite the test-set results from the evaluation cell with arrays of a
# different length, leaving `predictions` out of sync with `y_test`.
new_predictions = model.predict(new_emails)
new_probabilities = model.predict_proba(new_emails)

# Print results
for email, pred, prob in zip(new_emails, new_predictions, new_probabilities):
    print(f"\nEmail: {email}")
    print(f"Prediction: {'Spam' if pred == 1 else 'Not Spam'}")
    # Confidence is the highest class probability reported for this email
    print(f"Confidence: {max(prob):.2%}")


Email: Congratulations! You've won a prize!
Prediction: Spam
Confidence: 84.19%

Email: Team meeting scheduled for Friday
Prediction: Not Spam
Confidence: 82.07%
