# Naïve Bayes for spam detection

Here I develop a Naïve Bayes algorithm to detect spam in emails

The notebook is divided into 3 parts

   1. Data pre-processing
   2. Building a Naïve Bayes model for spam classification
   3. Model implementation


## 1) Data pre-processing

In [14]:
import os
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import string
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

[nltk_data] Downloading package punkt to /Users/seymour/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/seymour/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
data_dir = '/Users/seymour/Downloads/enron1'
class_names = ['ham', 'spam']


In [16]:
#Define function to process text
def preprocess(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    tokens = word_tokenize(text)

    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    text = ' '.join(tokens)
    return text

In [17]:
# Create a list to store the preprocessed text data and labels
data = []
labels = []

for i, class_name in enumerate(class_names):
    class_dir = os.path.join(data_dir, class_name)
    for file_name in os.listdir(class_dir):
        with open(os.path.join(class_dir, file_name), 'r', encoding='ISO-8859-1') as f:
            text = f.read()
        preprocessed_text = preprocess(text)
        data.append(preprocessed_text)
        labels.append(i)

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data)
y = np.array(labels)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## 2) Building a Naïve Bayes model for spam classification

In [18]:
alpha_values = [0.01, 0.1, 1, 10, 100]
best_accuracy = 0
best_alpha = 0
for alpha in alpha_values:
    model = MultinomialNB(alpha=alpha)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_alpha = alpha

model = MultinomialNB(alpha=best_alpha)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on the testing set: {accuracy:.3f}")

Accuracy on the testing set: 0.983


## 3) Model implementation

In [19]:
# Test the model on new data
def test_text_classifier(new_text):
    new_text_processed = preprocess(new_text)
    new_text_vectorized = vectorizer.transform([new_text_processed])
    prediction = model.predict(new_text_vectorized)[0]
    if prediction == 1:
        print(f"The message '{new_text}' is classified as spam.")
    else:
        print(f"The message '{new_text}' is not classified as spam.")


In [20]:
test_text_classifier("Hi did you get my email. Please update James asap!")

The message 'Hi did you get my email. Please update James asap!' is not classified as spam.


In [21]:
test_text_classifier("Hi you have won a price! Contact me for more information")

The message 'Hi you have won a price! Contact me for more information' is classified as spam.
