In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict

# Training data
spam_messages = ["Gana dinero rápido", "Reclama tu premio"]
ham_messages = ["Reunión a las 3PM", "Actualización del proyecto necesaria"]

# Combine all messages for vectorization
all_messages = spam_messages + ham_messages
labels = ['spam'] * len(spam_messages) + ['ham'] * len(ham_messages)

# Vectorize the messages to get bigrams.
# NOTE: this step is for display only. The extracted bigrams come out
# lowercased ('gana dinero', 'las 3pm'), whereas the transition tables below
# are built from str.split(), which keeps the original casing.
print("Vectorizing messages to extract bigrams...\n")
vectorizer = CountVectorizer(ngram_range=(2, 2), token_pattern=r"(?u)\b\w+\b")
X = vectorizer.fit_transform(all_messages)
bigrams = vectorizer.get_feature_names_out()
print(f"Extracted Bigrams: {bigrams}\n")

# Initialize transition count tables: counts[current_word][next_word] -> int
transition_counts_spam = defaultdict(lambda: defaultdict(int))
transition_counts_ham = defaultdict(lambda: defaultdict(int))

# Populate transition counts with explanations
print("Populating transition counts for spam and ham...\n")
for message, label in zip(all_messages, labels):
    words = message.split()
    print(f"Message: '{message}'")
    print(f"Words: {words}")

    # Any label other than 'spam' is counted as ham.
    if label == 'spam':
        class_name, class_counts = "Spam", transition_counts_spam
    else:
        class_name, class_counts = "Ham", transition_counts_ham

    # Consecutive word pairs; empty for messages with fewer than two words.
    for current_word, next_word in zip(words, words[1:]):
        class_counts[current_word][next_word] += 1
        print(f"{class_name} Transition: '{current_word}' -> '{next_word}' (Count: {class_counts[current_word][next_word]})")
    print()

# Calculate vocabulary size for smoothing (unique whitespace-separated tokens)
vocab = {word for msg in all_messages for word in msg.split()}
vocab_size = len(vocab)
print(f"Vocabulary Size (Unique Words): {vocab_size}\n")


def _laplace_transition_probs(transition_counts, vocab_size, class_name):
    """Turn raw transition counts into Laplace-smoothed probabilities.

    For every observed pair the probability is
    ``(count + 1) / (outgoing_count(word) + vocab_size)``.

    Args:
        transition_counts: mapping ``word -> {next_word: count}``.
        vocab_size: number of unique words, added to every denominator.
        class_name: label used only in the printed explanation
            (e.g. ``"Spam"``).

    Returns:
        A nested ``defaultdict`` ``word -> {next_word: probability}``
        containing only the transitions present in ``transition_counts``.
    """
    probs = defaultdict(lambda: defaultdict(float))
    for word, next_words in transition_counts.items():
        observed = sum(next_words.values())
        total_count = observed + vocab_size
        # Show how the denominator is built: it is NOT the raw number of
        # outgoing transitions, it also includes the smoothing term.
        print(f"Smoothing denominator for '{word}' ({class_name}): {observed} observed + {vocab_size} vocab = {total_count}")
        for next_word, count in next_words.items():
            probs[word][next_word] = (count + 1) / total_count
            print(f"P({next_word} | {word}, {class_name}) = ({count} + 1) / {total_count} = {probs[word][next_word]}")
        print()
    return probs


# Convert counts to probabilities with Laplace smoothing
print("Calculating transition probabilities with Laplace smoothing...\n")
transition_probs_spam = _laplace_transition_probs(transition_counts_spam, vocab_size, "Spam")
transition_probs_ham = _laplace_transition_probs(transition_counts_ham, vocab_size, "Ham")

# Step 2: Define the message prediction function
def predict_markov(message, spam_probs, ham_probs, vocab_size, p_spam=0.5, p_ham=0.5, verbose=True):
    """Classify ``message`` as spam or ham using word-to-word transition tables.

    The score of each class is its prior multiplied by the probability of
    every consecutive word pair in the message (words are obtained with
    ``str.split()``, so matching is case-sensitive).

    Args:
        message: text to classify.
        spam_probs: mapping ``word -> {next_word: probability}`` for spam.
            Any mapping works (plain ``dict`` or ``defaultdict``); it is
            never modified.
        ham_probs: same structure as ``spam_probs``, for ham.
        vocab_size: vocabulary size used for the fallback probability.
        p_spam: prior probability of spam.
        p_ham: prior probability of ham.
        verbose: when True (default) print every intermediate step.

    Returns:
        Tuple ``(label, spam_likelihood, ham_likelihood)`` where ``label`` is
        ``'spam'`` only if the spam likelihood is strictly greater; ties and
        messages with fewer than two words fall back to the priors / ``'ham'``.

    Note:
        Likelihoods are plain products of probabilities, so they shrink
        quickly as the message gets longer.
    """
    log = print if verbose else (lambda *args, **kwargs: None)

    words = message.split()
    spam_likelihood = p_spam
    ham_likelihood = p_ham
    # Probability used for any transition missing from a table. It is a fixed
    # approximation: it does not depend on how many transitions were observed
    # from the current word.
    unseen_prob = 1 / (vocab_size + 1)
    log("\nCalculating message probabilities...\n")

    for current_word, next_word in zip(words, words[1:]):
        # Use .get() on the outer mapping as well: indexing a defaultdict with
        # an unknown word would insert an empty entry into the trained model,
        # and indexing a plain dict would raise KeyError.
        spam_prob = spam_probs.get(current_word, {}).get(next_word, unseen_prob)
        ham_prob = ham_probs.get(current_word, {}).get(next_word, unseen_prob)

        log(f"Transition '{current_word}' -> '{next_word}':")
        log(f"  P({next_word} | {current_word}, Spam) = {spam_prob}")
        log(f"  P({next_word} | {current_word}, Ham) = {ham_prob}")

        spam_likelihood *= spam_prob
        ham_likelihood *= ham_prob

        log(f"  Updated Spam Likelihood: {spam_likelihood}")
        log(f"  Updated Ham Likelihood: {ham_likelihood}\n")

    log(f"Final Spam Likelihood: {spam_likelihood}")
    log(f"Final Ham Likelihood: {ham_likelihood}\n")

    label = 'spam' if spam_likelihood > ham_likelihood else 'ham'
    return label, spam_likelihood, ham_likelihood

# Score one unseen example with the trained transition tables
test_message = "Reclama tu dinero premio"
prediction, spam_score, ham_score = predict_markov(
    test_message,
    transition_probs_spam,
    transition_probs_ham,
    vocab_size,
)

# Collect the outcome in a one-row table for rich display
results_df = pd.DataFrame(
    [
        {
            "Message": test_message,
            "Spam Score": spam_score,
            "Ham Score": ham_score,
            "Prediction": prediction,
        }
    ]
)

display("Markov Model Spam Filter Results with Steps", results_df)

Vectorizing messages to extract bigrams...

Extracted Bigrams: ['a las' 'actualización del' 'del proyecto' 'dinero rápido' 'gana dinero'
 'las 3pm' 'proyecto necesaria' 'reclama tu' 'reunión a' 'tu premio']

Populating transition counts for spam and ham...

Message: 'Gana dinero rápido'
Words: ['Gana', 'dinero', 'rápido']
Spam Transition: 'Gana' -> 'dinero' (Count: 1)
Spam Transition: 'dinero' -> 'rápido' (Count: 1)

Message: 'Reclama tu premio'
Words: ['Reclama', 'tu', 'premio']
Spam Transition: 'Reclama' -> 'tu' (Count: 1)
Spam Transition: 'tu' -> 'premio' (Count: 1)

Message: 'Reunión a las 3PM'
Words: ['Reunión', 'a', 'las', '3PM']
Ham Transition: 'Reunión' -> 'a' (Count: 1)
Ham Transition: 'a' -> 'las' (Count: 1)
Ham Transition: 'las' -> '3PM' (Count: 1)

Message: 'Actualización del proyecto necesaria'
Words: ['Actualización', 'del', 'proyecto', 'necesaria']
Ham Transition: 'Actualización' -> 'del' (Count: 1)
Ham Transition: 'del' -> 'proyecto' (Count: 1)
Ham Transition: 'proyecto

'Markov Model Spam Filter Results with Steps'

Unnamed: 0,Message,Spam Score,Ham Score,Prediction
0,Reclama tu dinero premio,0.000296,0.000148,spam
