In [1]:
import pandas as pd
import re
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Download NLTK resources (if not already downloaded)
nltk.download('stopwords')

from nltk.corpus import stopwords

def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    text = re.sub(r'\b[\w\-.]+?@\w+?\.\w{2,4}\b', 'emailaddr', text)
    text = re.sub(r'(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)', 'httpaddr', text)
    text = re.sub(r'£|\$', 'moneysymb', text)
    text = re.sub(r'\b(\+\d{1,2}\s?)?\d?(\(\d{3}\))?[0-9.\-]+\d{1,9}\b', 'phonenumbr', text)
    text = re.sub(r'\d+(\.\d+)?', 'numbr', text)
    text = re.sub(r'\b[\w\-.]+?@\w+?\.\w{2,4}\b', 'emailaddr', text)
    text = re.sub(r'\b(U|u)\s?S|U\s?S\s?D\s?\S*?\s?D\s?ollar(s)?\b', 'usdollar', text)
    text = re.sub(r'\b(£|pounds?)\s?\S*?\s?(\d+(\.\d+)?)?\s?[mMkKbB]{0,1}\b', 'ukpound', text)
    text = re.sub(r'\b\d{1,2}:\d{2}(\s?[AaPp][Mm])?\b', 'timeofday', text)
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)
    text = ' '.join([word for word in text.split() if word.lower() not in stop_words])
    return text.lower()

# Load and preprocess data
df = pd.read_csv('spam.csv', encoding='latin1')
processed_emails = [preprocess_text(email) for email in df['v2']]
labels = df['v1'].map({'ham': 0, 'spam': 1})  # Mapping 'ham' to 0 and 'spam' to 1

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(processed_emails, labels, test_size=0.2, random_state=42)

# Feature extraction using CountVectorizer
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train Naive Bayes model
model = MultinomialNB()
model.fit(X_train_vectorized, y_train)

# Make predictions
predictions = model.predict(X_test_vectorized)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
conf_matrix = confusion_matrix(y_test, predictions)
classification_rep = classification_report(y_test, predictions)

print(f"Accuracy: {accuracy}")
print(f"Confusion Matrix:\n{conf_matrix}")
print(f"Classification Report:\n{classification_rep}")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sriva\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy: 0.9847533632286996
Confusion Matrix:
[[956   9]
 [  8 142]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       965
           1       0.94      0.95      0.94       150

    accuracy                           0.98      1115
   macro avg       0.97      0.97      0.97      1115
weighted avg       0.98      0.98      0.98      1115

