# Naive Bayes Spam Classification

In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
import re

## Step 1: Data Collection
Load the labeled email dataset

In [13]:
# Read the CSV (decoded as latin-1), keep only the label/text columns and
# give them descriptive names in one chained expression.
df = (
    pd.read_csv('spam.csv', encoding='latin-1')
    .loc[:, ['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'message'})
)

# Quick sanity summary: dataset size and how the two labels are balanced
print(
    f"Total emails in dataset: {len(df)}",
    "\nClass distribution:",
    df['label'].value_counts(),
    "\nFirst few emails:",
    sep="\n",
)
df.head()

Total emails in dataset: 5572

Class distribution:
label
ham     4825
spam     747
Name: count, dtype: int64

First few emails:


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Step 2: Feature Extraction
Split the messages by class label; the words used as features are extracted from each class in Step 3

In [17]:
# Build one plain list of message texts per class label
spam_emails, ham_emails = (
    df.loc[df['label'] == label, 'message'].tolist()
    for label in ('spam', 'ham')
)

# Report the size of each class and show the first message of each
print(
    f"Spam emails: {len(spam_emails)}",
    f"Ham emails: {len(ham_emails)}",
    f"\nExample spam email: {spam_emails[0]}",
    f"\nExample ham email: {ham_emails[0]}",
    sep="\n",
)

Spam emails: 747
Ham emails: 4825

Example spam email: Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's

Example ham email: Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...


## Step 3: Training Data Preparation
Count word occurrences in each class

In [28]:
# Fresh per-class word counters and the vocabulary shared by both classes
spam_word_count = defaultdict(int)
ham_word_count = defaultdict(int)
vocabulary = set()

# Tally every token that appears in a spam email
for email in spam_emails:
    # Lowercase, then drop isolated single letters and runs of digits
    text = re.sub(r'\b[a-z]\b|\d+', '', email.lower())
    # Strip everything that is neither a lowercase letter nor whitespace
    text = re.sub(r'[^a-z\s]', '', text)
    # Keep only tokens longer than one character
    words = [token for token in text.split() if len(token) > 1]

    for word in words:
        spam_word_count[word] += 1
    vocabulary.update(words)

print(
    f"Total words in spam emails: {sum(spam_word_count.values())}",
    f"Unique words in spam: {len(spam_word_count)}",
    sep="\n",
)

Total words in spam emails: 14880
Unique words in spam: 2215


In [29]:
# Process ham emails
# Start from an empty counter so that re-running this cell does not add the
# ham counts on top of those from a previous run (which would silently double
# them). The vocabulary is a set, so re-adding the same words leaves it unchanged.
ham_word_count = defaultdict(int)

for email in ham_emails:
    # Same cleaning steps as for the spam emails, so both classes are tokenized alike
    text = email.lower()
    # Remove single letters and numbers
    text = re.sub(r'\b[a-z]\b|\d+', '', text)
    # Remove non-alphabetic characters except spaces
    text = re.sub(r'[^a-z\s]', '', text)
    words = text.split()
    # Filter words with length > 1
    words = [word for word in words if len(word) > 1]

    for word in words:
        ham_word_count[word] += 1
        vocabulary.add(word)

print(f"Total words in ham emails: {sum(ham_word_count.values())}")
print(f"Unique words in ham: {len(ham_word_count)}")
print(f"\nTotal vocabulary size: {len(vocabulary)}")

Total words in ham emails: 61011
Unique words in ham: 7118

Total vocabulary size: 8321


## Step 4: Calculating Probabilities
### a) Prior Probabilities

In [33]:
# Class priors: the fraction of all training emails that belong to each class
total_emails = len(spam_emails) + len(ham_emails)
prior_spam, prior_ham = (
    len(emails) / total_emails for emails in (spam_emails, ham_emails)
)

print(
    "="*50,
    "PRIOR PROBABILITIES",
    "="*50,
    f"P(Spam) = {len(spam_emails)}/{total_emails} = {prior_spam:.4f}",
    f"P(Ham) = {len(ham_emails)}/{total_emails} = {prior_ham:.4f}",
    "="*50,
    sep="\n",
)

PRIOR PROBABILITIES
P(Spam) = 747/5572 = 0.1341
P(Ham) = 4825/5572 = 0.8659


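### b) Likelihood Probabilities with Laplace Smoothing

For a word $w$ and class $c$: $P(w \mid c) = \dfrac{\text{count}(w, c) + 1}{N_c + |V|}$, where $N_c$ is the total number of words counted in class $c$ and $|V|$ is the vocabulary size.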

In [37]:
# Denominator ingredients for the smoothed likelihoods:
# total token count per class and the size of the shared vocabulary
total_spam_words, total_ham_words = (
    sum(counts.values()) for counts in (spam_word_count, ham_word_count)
)
vocab_size = len(vocabulary)

print(
    f"Total words in spam: {total_spam_words}",
    f"Total words in ham: {total_ham_words}",
    f"Vocabulary size: {vocab_size}",
    sep="\n",
)

Total words in spam: 14880
Total words in ham: 61011
Vocabulary size: 8321


In [54]:
# Show the smoothed per-class likelihood of a few words often associated with spam
example_words = ['free', 'win', 'click', 'now', 'buy']

print(
    "\nExample Likelihood Probabilities (with Laplace smoothing):",
    "="*70,
    f"{'Word':<15} {'P(word|Spam)':<20} {'P(word|Ham)':<20}",
    "="*70,
    sep="\n",
)

for word in example_words:
    # Raw occurrence counts; .get() avoids inserting missing words into the counters
    spam_count, ham_count = (
        counts.get(word, 0) for counts in (spam_word_count, ham_word_count)
    )

    # Laplace smoothing: (count + 1) / (total_words + vocab_size)
    p_word_spam = (spam_count + 1) / (total_spam_words + vocab_size)
    p_word_ham = (ham_count + 1) / (total_ham_words + vocab_size)

    print(f"{word:<15} {p_word_spam:<20.8f} {p_word_ham:<20.8f}")


Example Likelihood Probabilities (with Laplace smoothing):
Word            P(word|Spam)         P(word|Ham)         
free            0.00948235           0.00086540          
win             0.00267230           0.00017308          
click           0.00025861           0.00004327          
now             0.00823240           0.00421162          
buy             0.00017241           0.00090867          


## Step 5: Storing the Model
Store all the trained parameters

In [69]:
# Bundle every trained quantity into one dictionary. The counters and the
# vocabulary are stored by reference, not copied.
model = dict(
    prior_spam=prior_spam,
    prior_ham=prior_ham,
    spam_word_count=spam_word_count,
    ham_word_count=ham_word_count,
    vocabulary=vocabulary,
    total_spam_words=total_spam_words,
    total_ham_words=total_ham_words,
    vocab_size=vocab_size,
)

# Summary of the scalar parameters
print(
    "="*50,
    "MODEL TRAINED AND STORED SUCCESSFULLY!",
    "="*50,
    f"Prior P(Spam): {prior_spam:.4f}",
    f"Prior P(Ham): {prior_ham:.4f}",
    f"Vocabulary size: {vocab_size}",
    f"Total spam words: {total_spam_words}",
    f"Total ham words: {total_ham_words}",
    "="*50,
    sep="\n",
)

MODEL TRAINED AND STORED SUCCESSFULLY!
Prior P(Spam): 0.1341
Prior P(Ham): 0.8659
Vocabulary size: 8321
Total spam words: 14880
Total ham words: 61011


## Step 6: Classification
Classify new emails using the trained model

In [70]:
# Test Email 1: Obvious spam
test_email_1 = "Congratulations! You've won a free iPhone! Click here now!"

print("Testing Email 1:", f"Message: {test_email_1}", "\n" + "="*70, sep="\n")

# Tokenize with the same steps used on the training emails: lowercase, drop
# isolated letters and digit runs, strip remaining non-letters, keep tokens
# longer than one character
text = re.sub(r'\b[a-z]\b|\d+', '', test_email_1.lower())
text = re.sub(r'[^a-z\s]', '', text)
words = [token for token in text.split() if len(token) > 1]

print(f"Extracted words: {words}", "="*70, sep="\n")

# Accumulate scores in log space (to avoid underflow), starting from the log priors
log_prob_spam, log_prob_ham = np.log(prior_spam), np.log(prior_ham)

print(
    "\nStarting with priors:",
    f"log P(Spam) = {log_prob_spam:.4f}",
    f"log P(Ham) = {log_prob_ham:.4f}",
    "\nMultiplying by word likelihoods:",
    sep="\n",
)

for word in words:
    spam_count = spam_word_count.get(word, 0)
    ham_count = ham_word_count.get(word, 0)

    # Laplace-smoothed P(word|class): (count + 1) / (total_words + vocab_size)
    p_word_spam = (spam_count + 1) / (total_spam_words + vocab_size)
    p_word_ham = (ham_count + 1) / (total_ham_words + vocab_size)

    # Adding logs is the log-space equivalent of multiplying the likelihoods
    log_prob_spam += np.log(p_word_spam)
    log_prob_ham += np.log(p_word_ham)

    print(f"  {word}: P(word|Spam)={p_word_spam:.6f}, P(word|Ham)={p_word_ham:.6f}")

print(
    "\n" + "="*70,
    f"Final log P(Spam|email) = {log_prob_spam:.4f}",
    f"Final log P(Ham|email) = {log_prob_ham:.4f}",
    sep="\n",
)

# The class with the larger log score wins; a tie goes to HAM
classification = "SPAM" if log_prob_spam > log_prob_ham else "HAM"

print(f"\n>>> Classification: {classification} <<<", "="*70, sep="\n")

Testing Email 1:
Message: Congratulations! You've won a free iPhone! Click here now!

Extracted words: ['congratulations', 'youve', 'won', 'free', 'iphone', 'click', 'here', 'now']

Starting with priors:
log P(Spam) = -2.0094
log P(Ham) = -0.1439

Multiplying by word likelihoods:
  congratulations: P(word|Spam)=0.000647, P(word|Ham)=0.000029
  youve: P(word|Spam)=0.000216, P(word|Ham)=0.000159
  won: P(word|Spam)=0.003319, P(word|Ham)=0.000288
  free: P(word|Spam)=0.009482, P(word|Ham)=0.000865
  iphone: P(word|Spam)=0.000043, P(word|Ham)=0.000029
  click: P(word|Spam)=0.000259, P(word|Ham)=0.000043
  here: P(word|Spam)=0.000302, P(word|Ham)=0.001615
  now: P(word|Spam)=0.008232, P(word|Ham)=0.004212

Final log P(Spam|email) = -59.3802
Final log P(Ham|email) = -66.9491

>>> Classification: SPAM <<<


In [71]:
# Test Email 2: Legitimate message
test_email_2 = "Hey, do you want to meet for coffee tomorrow afternoon?"

print("Testing Email 2:", f"Message: {test_email_2}", "\n" + "="*70, sep="\n")

# Tokenize with the same steps used on the training emails: lowercase, drop
# isolated letters and digit runs, strip remaining non-letters, keep tokens
# longer than one character
text = re.sub(r'\b[a-z]\b|\d+', '', test_email_2.lower())
text = re.sub(r'[^a-z\s]', '', text)
words = [token for token in text.split() if len(token) > 1]

print(f"Extracted words: {words}", "="*70, sep="\n")

# Accumulate scores in log space, starting from the log priors
log_prob_spam, log_prob_ham = np.log(prior_spam), np.log(prior_ham)

print(
    "\nStarting with priors:",
    f"log P(Spam) = {log_prob_spam:.4f}",
    f"log P(Ham) = {log_prob_ham:.4f}",
    "\nMultiplying by word likelihoods:",
    sep="\n",
)

for word in words:
    spam_count = spam_word_count.get(word, 0)
    ham_count = ham_word_count.get(word, 0)

    # Laplace-smoothed P(word|class): (count + 1) / (total_words + vocab_size)
    p_word_spam = (spam_count + 1) / (total_spam_words + vocab_size)
    p_word_ham = (ham_count + 1) / (total_ham_words + vocab_size)

    # Adding logs is the log-space equivalent of multiplying the likelihoods
    log_prob_spam += np.log(p_word_spam)
    log_prob_ham += np.log(p_word_ham)

    print(f"  {word}: P(word|Spam)={p_word_spam:.6f}, P(word|Ham)={p_word_ham:.6f}")

print(
    "\n" + "="*70,
    f"Final log P(Spam|email) = {log_prob_spam:.4f}",
    f"Final log P(Ham|email) = {log_prob_ham:.4f}",
    sep="\n",
)

# The class with the larger log score wins; a tie goes to HAM
classification = "SPAM" if log_prob_spam > log_prob_ham else "HAM"

print(f"\n>>> Classification: {classification} <<<", "="*70, sep="\n")

Testing Email 2:
Message: Hey, do you want to meet for coffee tomorrow afternoon?

Extracted words: ['hey', 'do', 'you', 'want', 'to', 'meet', 'for', 'coffee', 'tomorrow', 'afternoon']

Starting with priors:
log P(Spam) = -2.0094
log P(Ham) = -0.1439

Multiplying by word likelihoods:
  hey: P(word|Spam)=0.000259, P(word|Ham)=0.001543
  do: P(word|Spam)=0.001034, P(word|Ham)=0.005466
  you: P(word|Spam)=0.012413, P(word|Ham)=0.026640
  want: P(word|Spam)=0.001293, P(word|Ham)=0.002365
  to: P(word|Spam)=0.029611, P(word|Ham)=0.022428
  meet: P(word|Spam)=0.000302, P(word|Ham)=0.001053
  for: P(word|Spam)=0.008836, P(word|Ham)=0.007241
  coffee: P(word|Spam)=0.000043, P(word|Ham)=0.000115
  tomorrow: P(word|Spam)=0.000474, P(word|Ham)=0.001168
  afternoon: P(word|Spam)=0.000043, P(word|Ham)=0.000404

Final log P(Spam|email) = -72.2958
Final log P(Ham|email) = -60.7146

>>> Classification: HAM <<<


In [72]:
# Test Email 3: Another example
test_email_3 = "Quick cash opportunity"

print("Testing Email 3:", f"Message: {test_email_3}", "\n" + "="*70, sep="\n")

# Tokenize with the same steps used on the training emails: lowercase, drop
# isolated letters and digit runs, strip remaining non-letters, keep tokens
# longer than one character
text = re.sub(r'\b[a-z]\b|\d+', '', test_email_3.lower())
text = re.sub(r'[^a-z\s]', '', text)
words = [token for token in text.split() if len(token) > 1]

print(f"Extracted words: {words}", "="*70, sep="\n")

# Accumulate scores in log space, starting from the log priors
log_prob_spam, log_prob_ham = np.log(prior_spam), np.log(prior_ham)

print(
    "\nStarting with priors:",
    f"log P(Spam) = {log_prob_spam:.4f}",
    f"log P(Ham) = {log_prob_ham:.4f}",
    "\nMultiplying by word likelihoods:",
    sep="\n",
)

for word in words:
    spam_count = spam_word_count.get(word, 0)
    ham_count = ham_word_count.get(word, 0)

    # Laplace-smoothed P(word|class): (count + 1) / (total_words + vocab_size)
    p_word_spam = (spam_count + 1) / (total_spam_words + vocab_size)
    p_word_ham = (ham_count + 1) / (total_ham_words + vocab_size)

    # Adding logs is the log-space equivalent of multiplying the likelihoods
    log_prob_spam += np.log(p_word_spam)
    log_prob_ham += np.log(p_word_ham)

    print(f"  {word}: P(word|Spam)={p_word_spam:.6f}, P(word|Ham)={p_word_ham:.6f}")

print(
    "\n" + "="*70,
    f"Final log P(Spam|email) = {log_prob_spam:.4f}",
    f"Final log P(Ham|email) = {log_prob_ham:.4f}",
    sep="\n",
)

# The class with the larger log score wins; a tie goes to HAM
classification = "SPAM" if log_prob_spam > log_prob_ham else "HAM"

print(f"\n>>> Classification: {classification} <<<", "="*70, sep="\n")

Testing Email 3:
Message: Quick cash opportunity

Extracted words: ['quick', 'cash', 'opportunity']

Starting with priors:
log P(Spam) = -2.0094
log P(Ham) = -0.1439

Multiplying by word likelihoods:
  quick: P(word|Spam)=0.000043, P(word|Ham)=0.000130
  cash: P(word|Spam)=0.002715, P(word|Ham)=0.000188
  opportunity: P(word|Spam)=0.000043, P(word|Ham)=0.000058

Final log P(Spam|email) = -28.0222
Final log P(Ham|email) = -27.4355

>>> Classification: HAM <<<
