In [3]:
from typing import NamedTuple, Iterable, Set, Dict, Tuple
from collections import defaultdict
import math
import re

# ------------------------------
# Step 1: Define Message format
# ------------------------------
class Message(NamedTuple):
    """A single labelled text message used as a training example."""
    text: str  # raw message body, exactly as received
    is_spam: bool  # True when the message is labelled spam; False means ham

# ------------------------------
# Step 2: Tokenizer
# ------------------------------
def tokenize(text: str) -> Set[str]:
    """Return the set of distinct word tokens in *text*.

    The text is lower-cased first; a token is then any maximal run of the
    characters ``a-z``, ``0-9`` and the apostrophe. Everything else acts as
    a separator, and repeated words collapse into a single set entry.
    """
    lowered = text.lower()
    return {match.group(0) for match in re.finditer("[a-z0-9']+", lowered)}

# ------------------------------
# Step 3: Naive Bayes Classifier
# ------------------------------
class NaiveBayesClassifier:
    """Naive Bayes spam classifier over word-presence features.

    Each distinct token seen during training is treated as a binary feature
    (present / absent in a message). Per-class token frequencies are smoothed
    with the pseudocount ``k``, and ``predict`` combines the evidence from
    every known token, whether it is present in the message or not.
    """

    def __init__(self, k: float = 0.5) -> None:
        """Create an untrained classifier with smoothing pseudocount *k*."""
        self.k = k
        # Every distinct token seen in any training message.
        self.tokens: Set[str] = set()
        # Number of spam / ham messages that contained each token.
        self.token_spam_counts: Dict[str, int] = defaultdict(int)
        self.token_ham_counts: Dict[str, int] = defaultdict(int)
        # Number of training messages seen per class.
        self.spam_messages = 0
        self.ham_messages = 0

    def train(self, messages: Iterable[Message]) -> None:
        """Update the class and token counts from labelled *messages*.

        May be called repeatedly; counts accumulate across calls.
        """
        for message in messages:
            # Pick the per-class counter once per message instead of
            # re-testing the label for every token.
            if message.is_spam:
                self.spam_messages += 1
                class_counts = self.token_spam_counts
            else:
                self.ham_messages += 1
                class_counts = self.token_ham_counts

            # tokenize() returns a set, so each token counts once per message.
            for token in tokenize(message.text):
                self.tokens.add(token)
                class_counts[token] += 1

    def _probabilities(self, token: str) -> Tuple[float, float]:
        """Return the smoothed ``(P(token | spam), P(token | ham))`` pair."""
        # .get() rather than indexing: a lookup must not insert a zero entry
        # into the defaultdicts as a side effect of predicting.
        spam = self.token_spam_counts.get(token, 0)
        ham = self.token_ham_counts.get(token, 0)

        p_token_spam = (spam + self.k) / (self.spam_messages + 2 * self.k)
        p_token_ham = (ham + self.k) / (self.ham_messages + 2 * self.k)

        return p_token_spam, p_token_ham

    def predict(self, text: str) -> float:
        """Return the estimated probability that *text* is spam.

        Both classes are given equal prior weight. Tokens in *text* that were
        never seen in training are ignored. An untrained classifier
        returns 0.5.

        Raises:
            ValueError: if smoothing is disabled (``k == 0``) and a token
                probability degenerates to exactly 0 or 1, since its
                logarithm is undefined.
        """
        text_tokens = tokenize(text)
        log_prob_if_spam = log_prob_if_ham = 0.0

        for token in self.tokens:
            prob_if_spam, prob_if_ham = self._probabilities(token)

            if token in text_tokens:
                # Token present: add the log-probability of seeing it.
                log_prob_if_spam += math.log(prob_if_spam)
                log_prob_if_ham += math.log(prob_if_ham)
            else:
                # Token absent: add the log-probability of NOT seeing it.
                log_prob_if_spam += math.log(1.0 - prob_if_spam)
                log_prob_if_ham += math.log(1.0 - prob_if_ham)

        # P(spam) = e^s / (e^s + e^h) = 1 / (1 + e^(h - s)).
        # Exponentiating s and h separately underflows to 0.0 for large
        # vocabularies and ends in 0 / 0, so work with the difference only,
        # and always exponentiate a non-positive number so exp() cannot
        # overflow either.
        ham_minus_spam = log_prob_if_ham - log_prob_if_spam
        if ham_minus_spam >= 0.0:
            ratio = math.exp(-ham_minus_spam)
            return ratio / (1.0 + ratio)
        return 1.0 / (1.0 + math.exp(ham_minus_spam))

In [4]:
# ------------------------------
# Step 4: Testing the Classifier
# ------------------------------
if __name__ == "__main__":
    # Tiny hand-labelled corpus: three spam messages followed by three ham
    # messages.
    training_messages = [
        Message(text="Win money now!", is_spam=True),
        Message(text="Claim your free prize", is_spam=True),
        Message(text="Lowest price for your meds", is_spam=True),
        Message(text="Hey, are we still on for lunch?", is_spam=False),
        Message(text="Don't forget the meeting tomorrow", is_spam=False),
        Message(text="Can you review my code?", is_spam=False),
    ]

    # Fit a classifier with the default smoothing on the toy corpus.
    classifier = NaiveBayesClassifier()
    classifier.train(training_messages)

    # Unseen messages to score.
    test_messages = [
        "Free money for you",
        "Are we meeting for lunch?",
        "Win a free iPhone",
        "Reminder: project meeting tomorrow",
        "Congratulations! Claim your prize now",
    ]

    # Report the spam probability for each message, one blank line apart.
    print("Predictions:\n")
    for msg in test_messages:
        prob = classifier.predict(msg)
        print(f"{msg}\n→ Spam Probability: {prob:.4f}\n")


Predictions:

Free money for you
→ Spam Probability: 0.9499

Are we meeting for lunch?
→ Spam Probability: 0.0143

Win a free iPhone
→ Spam Probability: 0.9876

Reminder: project meeting tomorrow
→ Spam Probability: 0.2039

Congratulations! Claim your prize now
→ Spam Probability: 0.9997

