In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from functools import reduce
from itertools import chain
import operator

In [2]:
messages = pd.read_csv('messages.csv',
                       encoding='latin-1',
                       usecols=['v2', 'v1'])
messages.columns = ['Report', 'Message']
messages.head()

Unnamed: 0,Report,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Splitting Training and Testing data
train, test = train_test_split(messages, test_size=0.2)

spam_emails = train[train['Report'] == 'spam']
ham_emails = train[train['Report'] == 'ham']

spam_emails_text = " ".join(
    [message.lower() for message in spam_emails['Message']])
ham_emails_text = " ".join(
    [message.lower() for message in ham_emails['Message']])

spam_emails_set = set(spam_emails)
ham_emails_set = set(ham_emails)

unique_words = set((" ".join(train['Message']).lower().split()))
total_unique_words = len(unique_words)

# calculating probability of spam and ham messages
probability_spam_mail = len(spam_emails) / len(train)
probability_ham_mail = len(ham_emails) / len(train)

In [4]:
def is_spam(message) -> str:
    """
        check each message for spam using Naive Bayes
    """
    message = message.lower()

    spam_probability = 1
    # probability that word contribute to spam
    probability_spam = dict()
    # probability that word contribute to ham
    probability_ham = dict()

    spam_denominator = len(spam_emails_text.split(" ")) + total_unique_words
    ham_denominator = len(ham_emails_text.split(" ")) + total_unique_words

    # addition of 1 is to avoid zero multiplication
    for word in message.split():
        probability_spam[word] = (spam_emails_text.count(word) +
                                  1) / spam_denominator
        probability_spam[word] **= message.count(word)

        probability_ham[word] = (ham_emails_text.count(word) +
                                 1) / ham_denominator
        probability_ham[word] **= message.count(word)

    probability_spam[""] = probability_spam_mail
    probability_ham[""] = probability_ham_mail

    #     spam_probability_num = reduce(operator.mul, probability_spam.values())
    spam_probability_num = reduce(lambda a, b: a * b,
                                  probability_spam.values())
    #     spam_probability_den = reduce(operator.mul, probability_ham.values())
    spam_probability_den = reduce(lambda a, b: a * b, probability_ham.values())

    if spam_probability_num == 0 or spam_probability_den == 0:
        return 'ham'

    spam_probability = spam_probability_num / \
        (spam_probability_num + spam_probability_den)

    return 'spam' if spam_probability >= 0.5 else 'ham'

In [5]:
# # calculating probability of each word for testing set
correct_output = 0

for test_email in test.itertuples():
    prediction = is_spam(test_email.Message)
    if test_email.Report == prediction:
        correct_output += 1

    else:
        prediction += " <------"

    print(f"Actual: {test_email.Report}, Predicted: {prediction}")

accuracy = round((correct_output / len(test)) * 100, 2)
print(f"Total = {len(test)}, Correct Prediction = {correct_output}, "
      f"Incorrect Prediction = {len(test) - correct_output}")
print(f"Accuracy = {accuracy}%")

Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: spam, Predicted: spam
Actual: ham, Predicted: ham
Actual: spam, Predicted: spam
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: spam, Predicted: spam
Actual: ham, Predicted: ham
Actual: spam, Predicted: spam
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: spam, Predicted: spam
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ha

Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: spam, Predicted: spam
Actual: spam, Predicted: spam
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: spam, Predicted: spam
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: spam, Predicted: spam
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: spam, Predicted: spam
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ha

Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: spam <------
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: spam, Predicted: spam
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: spam, Predicted: spam
Actual: ham, Predicted: ham
Actual:

Actual: ham, Predicted: ham
Actual: spam, Predicted: spam
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: spam, Predicted: spam
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: spam, Predicted: spam
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: spam, Predicted: spam
Actual: ham, Predicted: ham
Actual: ham, Predicted: ham
Actual: ham,