In [None]:
# ============================================
# Assignment: Naive Bayes Email Classifier (No sklearn)
# ============================================

# 1. Labelled training emails.
# Each entry is a (text, label) pair; the label is either "spam" or "ham".
_SPAM = "spam"
_HAM = "ham"

emails = [
    ("Congratulations! You have won a free lottery ticket", _SPAM),
    ("Win cash prizes now by clicking this link", _SPAM),
    ("Get a free entry to the contest today", _SPAM),
    ("Reminder: meeting scheduled for tomorrow at 10am", _HAM),
    ("Can we reschedule the project meeting?", _HAM),
    ("Please find the attached report for your review", _HAM),
    ("Exclusive offer! Win a free vacation trip", _SPAM),
    ("Your account update request was received", _HAM),
    ("Don't miss this chance to win cash rewards", _SPAM),
    ("Let's catch up for lunch tomorrow", _HAM),
]

# 2. Vocabulary: the words used as binary features, in feature-vector order.
vocabulary = [
    # words that show up in the spam examples
    "free",
    "win",
    "cash",
    "prize",
    "offer",
    # words that show up in the ham examples
    "meeting",
    "project",
    "report",
    "account",
    "lunch",
]

# text_to_vector always returns a vector of length len(vocabulary) (10 for the vocabulary above)
def text_to_vector(text, vocabulary):
    """Convert an email into a binary feature vector.

    The text is lower-cased, ASCII punctuation is removed, and the result is
    split on whitespace. Each vocabulary word then maps to 1 if it appears as
    a whole token in the email and to 0 otherwise (so "prizes" does not match
    the vocabulary word "prize").

    Args:
        text: Raw email text.
        vocabulary: Sequence of words; it fixes the order and the length of
            the returned vector. Words are compared against lower-cased
            tokens, so they should be lower-case themselves.

    Returns:
        A list of 0/1 ints with one entry per vocabulary word.
    """
    # Imported here because the file has no top-level import section.
    import string

    # The punctuation constant lives in the `string` module; the built-in
    # `str` type has no `punctuation` attribute (AttributeError).
    strip_punctuation = str.maketrans("", "", string.punctuation)
    cleaned = text.lower().translate(strip_punctuation)

    # A set gives O(1) membership tests for every vocabulary word.
    tokens = set(cleaned.split())
    return [1 if word in tokens else 0 for word in vocabulary]


# ===============================
# 3. Train the Naive Bayes model
# ===============================
def train_naive_bayes(dataset):
    """
    Train a two-class (spam/ham) Naive Bayes classifier on binary features.

    Input: dataset = [(vector, label), ...]
        Every vector is a list of 0/1 features and all vectors share the
        same length. Any label other than "spam" is counted as "ham".
    Output: priors, cond_probs
        priors     = {"spam": P(spam), "ham": P(ham)}
        cond_probs = {"spam": [P(feature_i = 1 | spam), ...],
                      "ham":  [P(feature_i = 1 | ham), ...]}
    Raises: ValueError if dataset is empty.
    """
    if not dataset:
        raise ValueError("dataset must contain at least one (vector, label) pair")

    # Take the feature count from the data itself so the function does not
    # depend on a module-level vocabulary being defined.
    number_of_words = len(dataset[0][0])

    # Count samples per class and occurrences of each feature given the class.
    class_counts = {"spam": 0, "ham": 0}
    feature_counts = {
        "spam": [0] * number_of_words,
        "ham": [0] * number_of_words,
    }
    for vector, label in dataset:
        cls = "spam" if label == "spam" else "ham"
        class_counts[cls] += 1
        counts = feature_counts[cls]
        for i in range(number_of_words):
            counts[i] += vector[i]

    # Priors: fraction of training samples in each class.
    total = len(dataset)
    priors = {cls: count / total for cls, count in class_counts.items()}

    # Laplace smoothing: +1 in the numerator and +2 in the denominator (one
    # per possible feature value, 0 or 1). This keeps every conditional
    # probability strictly between 0 and 1, so a word never seen in a class
    # cannot zero out the whole product at prediction time.
    cond_probs = {
        cls: [(count + 1) / (class_counts[cls] + 2) for count in feature_counts[cls]]
        for cls in class_counts
    }
    return priors, cond_probs


# ===============================
# 4. Prediction function
# ===============================
def predict(vector, priors, cond_probs):
    """
    Classify one feature vector as "spam" or "ham".

    Each class score starts at its prior and is multiplied, feature by
    feature, by P(feature = 1 | class) when the feature equals 1 and by
    1 - P(feature = 1 | class) otherwise.

    Returns a (label, score) tuple for the winning class; "spam" wins only
    when its score is strictly greater, so ties go to "ham".
    """
    score_for_spam = priors["spam"]
    score_for_ham = priors["ham"]

    for position, flag in enumerate(vector):
        if flag != 1:
            # Word absent: use the complement of the "word present" probability.
            score_for_spam *= (1 - cond_probs["spam"][position])
            score_for_ham *= (1 - cond_probs["ham"][position])
        else:
            score_for_spam *= cond_probs["spam"][position]
            score_for_ham *= cond_probs["ham"][position]

    return ("spam", score_for_spam) if score_for_spam > score_for_ham else ("ham", score_for_ham)


# ===============================
# 5. Main program
# ===============================
def main():
    """Train on the built-in emails, print the model, and classify test emails.

    Builds feature vectors for the module-level `emails` using `vocabulary`,
    trains the classifier, prints the learned priors and conditional
    probabilities, then prints the predicted label and score for each of a
    small set of hand-written test emails.
    """
    dataset = [(text_to_vector(text, vocabulary), label) for text, label in emails]

    # Train model
    priors, cond_probs = train_naive_bayes(dataset)

    # Display learned probabilities
    print("Vocabulary:", vocabulary)
    print("Priors:", priors)
    print("Conditional probabilities:")
    for c, probs in cond_probs.items():
        print(f"  {c}: {probs}")

    # Test data as (text, expected label) pairs; the expected label is kept
    # for manual comparison with the printed prediction.
    test_emails = [
        ("Win a free cash prize now", "spam"),
        ("Meeting about the project report tomorrow", "ham"),
        ("Your free offer expires soon", "spam"),
        ("Let's schedule lunch next week", "ham"),
        # some additional cases
        ("Exclusive cash offer, click to win", "spam"),
        ("Attached is the project report for your review", "ham"),
    ]

    # Predict
    print("\nPredictions:")
    for text, _expected in test_emails:
        vec = text_to_vector(text, vocabulary)
        label, score = predict(vec, priors, cond_probs)
        # Print only the email text: formatting the whole (text, label) tuple
        # put the tuple's repr inside the quotes.
        print(f"Email: '{text}'\n  Predicted: {label}, Scores: {score}\n")

# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()


Vocabulary: ['free', 'win', 'cash', 'prize', 'offer', 'meeting', 'project', 'report', 'account', 'lunch']
Priors: {'spam': 0.5, 'ham': 0.5}
Conditional probabilities:
  spam: [0.5714285714285714, 0.5714285714285714, 0.42857142857142855, 0.14285714285714285, 0.14285714285714285, 0.14285714285714285, 0.14285714285714285, 0.14285714285714285, 0.14285714285714285, 0.14285714285714285]
  ham: [0.14285714285714285, 0.14285714285714285, 0.14285714285714285, 0.14285714285714285, 0.14285714285714285, 0.2857142857142857, 0.2857142857142857, 0.2857142857142857, 0.2857142857142857, 0.2857142857142857]

Predictions:
Email: '('Win a free cash prize now', 'spam')'
  Predicted: spam, Scores: 0.003964042881505701

Email: '('Meeting about the project report tomorrow', 'ham')'
  Predicted: ham, Scores: 0.0027528075566011813

Email: '('Your free offer expires soon', 'spam')'
  Predicted: spam, Scores: 0.003964042881505701

Email: '("Let's schedule lunch next week", 'ham')'
  Predicted: ham, Scores: 0.0172

What we need to do here is classify a new email as spam or ham. So we want to compare P(spam|email) and P(ham|email). We get a score proportional to each one by multiplying P(class) by P(word[i]|class) for every vocabulary word that appears in the email (and by 1 - P(word[i]|class) for every vocabulary word that does not), and then we pick the class whose score is highest.