In [None]:
# Spam Detection Using Bernoulli Naïve Bayes
# Example from ChatGPT

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Toy corpus: each entry pairs an email text with its class (1 = Spam, 0 = Ham).
# Keeping text and label side by side makes it impossible for them to drift apart.
_labelled_emails = [
    ("Win a free lottery now", 1),                # Spam
    ("Your invoice is attached", 0),              # Ham
    ("Claim your free prize", 1),                 # Spam
    ("Meeting at 10 AM", 0),                      # Ham
    ("Congratulations! You won a gift card", 1),  # Spam
    ("Let's schedule a call", 0),                 # Ham
]

# Unzip into the two parallel lists the rest of the notebook works with.
emails = [text for text, _ in _labelled_emails]
labels = [label for _, label in _labelled_emails]  # 1 = Spam, 0 = Ham


In [None]:
# Build a presence/absence bag-of-words representation of the emails.
# binary=True makes every non-zero count 1, so a feature only records
# whether a word occurs in an email, not how often.
vectorizer = CountVectorizer(binary=True)
vectorizer.fit(emails)            # learn the vocabulary from the corpus
X = vectorizer.transform(emails)  # encode the same emails against that vocabulary

In [None]:
# Bare expression: the notebook displays the repr of X, i.e. the value
# returned by vectorizer.fit_transform(emails).
X

In [None]:
# Split dataset into train and test partitions.
# stratify=labels keeps the spam/ham ratio the same in both partitions; with
# only six samples an unstratified shuffle can leave one side lopsided
# (e.g. a test set made up of spam only).
# random_state fixes the shuffle so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    labels,
    test_size=0.3,
    random_state=42,
    stratify=labels,
)

In [None]:
# Show the training portion of the feature matrix returned by train_test_split.
# NOTE(review): X_train is presumably a SciPy sparse matrix here, in which case
# print() lists only the stored (row, column) entries rather than a dense grid — confirm.
print(X_train)

In [None]:
# Fit a Bernoulli Naïve Bayes classifier (default hyper-parameters) on the
# training features and their spam/ham labels.
bnb = BernoulliNB()
bnb.fit(X=X_train, y=y_train)

In [None]:
# Predict labels for the held-out emails
y_pred = bnb.predict(X_test)

# Fraction of held-out emails classified correctly, shown to two decimals
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

In [None]:
# Classify a previously unseen email.
# The text must be encoded with the already-fitted vectorizer (transform, not
# fit_transform) so its columns line up with the features the model was trained on.
new_email = ["You have won a free vacation"]
X_new = vectorizer.transform(new_email)
prediction = bnb.predict(X_new)

# prediction holds one label per input email; 1 means spam.
if prediction[0] == 1:
    verdict = "Spam"
else:
    verdict = "Not Spam"
print(verdict)

In [None]:
# another example for transform text to sparse matrix 
emails = [
    "Win a free lottery",  
    "Your invoice is attached",  
    "Claim your free prize"
]

# Convert text to binary word presence
vectorizer = CountVectorizer(binary=True)
X = vectorizer.fit_transform(emails)

In [None]:
# Print matrix
print(X.toarray())  # Convert sparse matrix to dense format

In [None]:
# check the vocabulary mapping using
print(vectorizer.get_feature_names_out())

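Columns: attached, claim, free, invoice, is, lottery, prize, win, your ("a" is dropped: the default tokenizer ignores single-character tokens)
[[0 0 1 0 0 1 0 1 0]  # "Win a free lottery"
 [1 0 0 1 1 0 0 0 1]  # "Your invoice is attached"
 [0 1 1 0 0 0 1 0 1]] # "Claim your free prize"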