In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
# Sample dataset of emails and labels
data = {
    "Email": [
        "Congratulations, you have won a lottery!", 
        "Reminder: Meeting at 10 AM tomorrow.", 
        "Exclusive offer for you! Buy now and save.", 
        "How about lunch at 1 PM today?", 
        "Claim your free gift today, limited offer!", 
        "Can we reschedule the meeting to next week?",
        "Get a free vacation package! Act now.",
        "Let me know if you are available for the meeting."
    ],
    "Label": ["spam", "not spam", "spam", "not spam", "spam", "not spam", "spam", "not spam"]
}
# Convert the data to a DataFrame
df = pd.DataFrame(data)
# Features and target
X = df["Email"]  # Email content
y = df["Label"]  # Labels for spam or not spam
# Convert labels to numeric values: spam = 1, not spam = 0
y = y.map({"spam": 1, "not spam": 0})
# Text preprocessing: Clean text, remove stopwords, and apply stemming
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()
def preprocess_text(text):
    # Remove non-alphabetical characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Remove stopwords and apply stemming
    text = " ".join([stemmer.stem(word) for word in text.split() if word not in stop_words])
    return text
# Apply preprocessing to the emails
X_cleaned = X.apply(preprocess_text)
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_cleaned, y, test_size=0.25, random_state=42)
# Convert the email text into numerical features using TF-IDF Vectorizer
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)
# Initialize and train the Naive Bayes model
nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)
# Evaluate the model without cross-validation (just a single train-test split)
y_pred = nb.predict(X_test_tfidf)
# Model evaluation
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
# Display evaluation results
print("Accuracy of the Email Spam Classifier:", accuracy)
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)
# New email prediction
new_email = ["Exclusive offer! Get your free iPhone now!"]
new_email_tfidf = vectorizer.transform(new_email)
# Predict if the new email is spam or not
prediction = nb.predict(new_email_tfidf)
spam_status = 'spam' if prediction == 1 else 'not spam'
# Display the new email content and its classification
print(f"\nThe new email is: {new_email[0]}")
print(f"The new email is classified as: {spam_status}")

Accuracy of the Email Spam Classifier: 0.0

Confusion Matrix:
 [[0 2]
 [0 0]]

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       2.0
           1       0.00      0.00      0.00       0.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0


The new email is: Exclusive offer! Get your free iPhone now!
The new email is classified as: spam


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
