In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

### Define Data

For this example, we'll use a simplified dataset. If you have a specific spam dataset, you can use that, or I can guide you to download the famous SMS Spam Collection dataset.

Let's use a small sample dataset for now:

In [35]:
data = {
    "message": [
        "Win cash now", 
        "Claim your prize", 
        "Meeting at 3PM", 
        "Project update needed", 
        "Exclusive offer just for you", 
        "Call now to claim your reward",
        "Let's meet tomorrow",
        "Don't forget the team meeting",
        "Free entry in 2 million prize",
        "Your bill is due next week"
    ],
    "label": ["spam", "spam", "ham", "ham", "spam", "spam", "ham", "ham", "spam", "ham"]
}

df = pd.DataFrame(data)
display(df)

Unnamed: 0,message,label
0,Win cash now,spam
1,Claim your prize,spam
2,Meeting at 3PM,ham
3,Project update needed,ham
4,Exclusive offer just for you,spam
5,Call now to claim your reward,spam
6,Let's meet tomorrow,ham
7,Don't forget the team meeting,ham
8,Free entry in 2 million prize,spam
9,Your bill is due next week,ham


In [36]:
# 3. Split the Data into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(df["message"], df["label"], test_size=0.3, random_state=42)
#4. Vectorize the Messages
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# 5. Train the Naive Bayes Model
model = MultinomialNB()
model.fit(X_train_vectorized, y_train)
# 6. Make Predictions and Evaluate the Model
predictions = model.predict(X_test_vectorized)

accuracy = accuracy_score(y_test, predictions)
conf_matrix = confusion_matrix(y_test, predictions)
report = classification_report(y_test, predictions, zero_division=0)

print("Accuracy:", accuracy)
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", report)




Accuracy: 0.0

Confusion Matrix:
 [[0 0]
 [3 0]]

Classification Report:
               precision    recall  f1-score   support

         ham       0.00      0.00      0.00       0.0
        spam       0.00      0.00      0.00       3.0

    accuracy                           0.00       3.0
   macro avg       0.00      0.00      0.00       3.0
weighted avg       0.00      0.00      0.00       3.0



### Test Model

You can now test the model with some new messages:

In [37]:
test_messages = ["Win a free trip to Bahamas", "Let's have a call", "Claim your cash reward now"]
test_vectorized = vectorizer.transform(test_messages)
test_predictions = model.predict(test_vectorized)

for message, label in zip(test_messages, test_predictions):
    print(f"Message: '{message}' is classified as '{label}'")

Message: 'Win a free trip to Bahamas' is classified as 'spam'
Message: 'Let's have a call' is classified as 'ham'
Message: 'Claim your cash reward now' is classified as 'spam'


In [38]:
# Extract feature probabilities
feature_probs_spam = model.feature_log_prob_[0]
feature_probs_ham = model.feature_log_prob_[1]
features = vectorizer.get_feature_names_out()

# Create a DataFrame to display the word probabilities
word_probs_df = pd.DataFrame({
    "Word": features,
    "P(Word|Spam)": feature_probs_spam,
    "P(Word|Ham)": feature_probs_ham
})

display("Word Probabilities", word_probs_df)
print(f"Test Message: '{test_messages[0]}' is classified as '{test_predictions[0]}'")

'Word Probabilities'

Unnamed: 0,Word,P(Word|Spam),P(Word|Ham)
0,3pm,-3.157,-3.555348
1,at,-3.157,-3.555348
2,bill,-3.157,-3.555348
3,cash,-3.850148,-2.862201
4,don,-3.157,-3.555348
5,due,-3.157,-3.555348
6,exclusive,-3.850148,-2.862201
7,for,-3.850148,-2.862201
8,forget,-3.157,-3.555348
9,is,-3.157,-3.555348


Test Message: 'Win a free trip to Bahamas' is classified as 'spam'
