In [None]:
# setting things up
import pandas as pd
from math import log
import time
from prettytable import PrettyTable

In [None]:
# function for Multinomial Naive Bayes
def multinomial_naive_bayes(data: pd.DataFrame, i: int, k: int, x: int):
    """Train and evaluate a multinomial Naive Bayes spam filter on one CV fold.

    The rows are cut into ``k`` contiguous folds of ``len(data) // k`` rows.
    Fold ``i`` (0-based) is held out for testing; every other row, including
    any remainder rows after the last full fold, is used for training.

    Args:
        data: Frame whose first column is the label (spam when it equals
            1/True, ham otherwise) and whose second column is the message
            text. Rows and columns are addressed by position, so index
            labels and column names do not matter.
        i: Index of the fold used as the test set.
        k: Number of folds.
        x: Report selector: 0 returns accuracies, any other value returns
            precision and recall.

    Returns:
        ``(spam_accuracy, ham_accuracy, overall_accuracy, train_seconds)``
        when ``x == 0``, otherwise ``(precision, recall, train_seconds)``.
        A ratio whose denominator is zero (for example, no spam in the test
        fold) is reported as 0.0 instead of raising ZeroDivisionError.
    """
    start_time = time.time()
    size_of_each_fold = data.shape[0] // k
    fold_start = i * size_of_each_fold
    fold_stop = fold_start + size_of_each_fold

    # Positional access keeps the split correct for any index / column labels.
    labels = [1 if label == 1 else 0 for label in data.iloc[:, 0]]
    messages = list(data.iloc[:, 1])

    test_labels = labels[fold_start:fold_stop]
    test_messages = messages[fold_start:fold_stop]
    train_labels = labels[:fold_start] + labels[fold_stop:]
    train_messages = messages[:fold_start] + messages[fold_stop:]

    # Class priors, keyed 0 = ham, 1 = spam.
    n_train = len(train_labels)
    n_spam = sum(train_labels)
    pi = {
        0: (n_train - n_spam) / n_train if n_train else 0.0,
        1: n_spam / n_train if n_train else 0.0,
    }

    # word -> [occurrences in ham, occurrences in spam]
    word_counts = {}
    # total number of tokens seen per class: [ham, spam]
    tokens_per_class = [0, 0]
    for label, message in zip(train_labels, train_messages):
        words = message.split(" ")
        tokens_per_class[label] += len(words)
        for word in words:
            word_counts.setdefault(word, [0, 0])[label] += 1

    # Add-one (Laplace) smoothing: (count + 1) / (class tokens + |vocabulary|).
    vocabulary_size = len(word_counts)
    log_pxc = {
        word: (
            log((counts[0] + 1) / (tokens_per_class[0] + vocabulary_size)),
            log((counts[1] + 1) / (tokens_per_class[1] + vocabulary_size)),
        )
        for word, counts in word_counts.items()
    }
    end_time = time.time()

    # A class missing from the training split has prior 0; its log-score is
    # -inf so it can never win, instead of log(0) raising ValueError.
    log_prior = {
        label: log(prior) if prior > 0 else float("-inf")
        for label, prior in pi.items()
    }

    # confusion[actual][predicted], with 0 = ham and 1 = spam
    confusion = [[0, 0], [0, 0]]
    for label, message in zip(test_labels, test_messages):
        prob_ham = log_prior[0]
        prob_spam = log_prior[1]
        for word in message.split(" "):
            word_logs = log_pxc.get(word)
            if word_logs is None:
                # Words never seen in training carry no evidence.
                continue
            prob_ham += word_logs[0]
            prob_spam += word_logs[1]
        # Ties are resolved in favour of spam.
        predicted = 1 if prob_spam >= prob_ham else 0
        confusion[label][predicted] += 1

    tn, fp = confusion[0]
    fn, tp = confusion[1]
    total = tp + fp + tn + fn
    time_taken = end_time - start_time

    if x == 0:
        correct_predictions_spam = tp / (tp + fn) if (tp + fn) else 0.0
        correct_predictions_ham = tn / (tn + fp) if (tn + fp) else 0.0
        correct_predictions = (tp + tn) / total if total else 0.0
        return (correct_predictions_spam, correct_predictions_ham, correct_predictions, time_taken)

    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    return (precision, recall, time_taken)

In [None]:
# function for Multivariate Naive Bayes
def multivariate_naive_bayes(data: pd.DataFrame, i: int, k: int, x: int):
    """Train and evaluate a multivariate (Bernoulli) Naive Bayes spam filter on one CV fold.

    The rows are cut into ``k`` contiguous folds of ``len(data) // k`` rows.
    Fold ``i`` (0-based) is held out for testing; every other row, including
    any remainder rows after the last full fold, is used for training.

    Each message is modelled as one presence/absence indicator per vocabulary
    word, so a message is scored with P(w|c) for every vocabulary word it
    contains AND 1 - P(w|c) for every vocabulary word it lacks.

    Args:
        data: Frame whose first column is the label (spam when it equals
            1/True, ham otherwise) and whose second column is the message
            text. Rows and columns are addressed by position, so index
            labels and column names do not matter.
        i: Index of the fold used as the test set.
        k: Number of folds.
        x: Report selector: 0 returns accuracies, any other value returns
            precision and recall.

    Returns:
        ``(spam_accuracy, ham_accuracy, overall_accuracy, train_seconds)``
        when ``x == 0``, otherwise ``(precision, recall, train_seconds)``.
        A ratio whose denominator is zero (for example, no spam in the test
        fold) is reported as 0.0 instead of raising ZeroDivisionError.
    """
    start_time = time.time()
    size_of_each_fold = data.shape[0] // k
    fold_start = i * size_of_each_fold
    fold_stop = fold_start + size_of_each_fold

    # Positional access keeps the split correct for any index / column labels.
    labels = [1 if label == 1 else 0 for label in data.iloc[:, 0]]
    messages = list(data.iloc[:, 1])

    test_labels = labels[fold_start:fold_stop]
    test_messages = messages[fold_start:fold_stop]
    train_labels = labels[:fold_start] + labels[fold_stop:]
    train_messages = messages[:fold_start] + messages[fold_stop:]

    # Class priors, keyed 0 = ham, 1 = spam.
    n_train = len(train_labels)
    n_spam = sum(train_labels)
    pi = {
        0: (n_train - n_spam) / n_train if n_train else 0.0,
        1: n_spam / n_train if n_train else 0.0,
    }

    # number of training messages per class: [ham, spam]
    docs_per_class = [n_train - n_spam, n_spam]
    # word -> [ham messages containing it, spam messages containing it]
    doc_freq = {}
    for label, message in zip(train_labels, train_messages):
        for word in set(message.split(" ")):
            doc_freq.setdefault(word, [0, 0])[label] += 1

    # Add-one (Laplace) smoothing over the two outcomes present / absent:
    #   P(present | c) = (df + 1) / (N_c + 2)
    #   P(absent  | c) = (N_c - df + 1) / (N_c + 2)
    # present_bonus[word][c] = log P(present|c) - log P(absent|c) is what a
    # message gains over the "every vocabulary word absent" baseline.
    present_bonus = {}
    all_absent = [0.0, 0.0]
    for word, freq in doc_freq.items():
        bonus = []
        for label in (0, 1):
            denominator = docs_per_class[label] + 2
            log_present = log((freq[label] + 1) / denominator)
            log_absent = log((docs_per_class[label] - freq[label] + 1) / denominator)
            all_absent[label] += log_absent
            bonus.append(log_present - log_absent)
        present_bonus[word] = (bonus[0], bonus[1])
    end_time = time.time()

    # A class missing from the training split has prior 0; its log-score is
    # -inf so it can never win, instead of log(0) raising ValueError.
    log_prior = {
        label: log(prior) if prior > 0 else float("-inf")
        for label, prior in pi.items()
    }

    # confusion[actual][predicted], with 0 = ham and 1 = spam
    confusion = [[0, 0], [0, 0]]
    for label, message in zip(test_labels, test_messages):
        # Start from "no vocabulary word occurs" and correct for present ones.
        prob_ham = log_prior[0] + all_absent[0]
        prob_spam = log_prior[1] + all_absent[1]
        for word in set(message.split(" ")):
            bonus = present_bonus.get(word)
            if bonus is None:
                # Words outside the training vocabulary carry no evidence.
                continue
            prob_ham += bonus[0]
            prob_spam += bonus[1]
        # Ties are resolved in favour of spam.
        predicted = 1 if prob_spam >= prob_ham else 0
        confusion[label][predicted] += 1

    tn, fp = confusion[0]
    fn, tp = confusion[1]
    total = tp + fp + tn + fn
    time_taken = end_time - start_time

    if x == 0:
        correct_predictions_spam = tp / (tp + fn) if (tp + fn) else 0.0
        correct_predictions_ham = tn / (tn + fp) if (tn + fp) else 0.0
        correct_predictions = (tp + tn) / total if total else 0.0
        return (correct_predictions_spam, correct_predictions_ham, correct_predictions, time_taken)

    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    return (precision, recall, time_taken)

In [None]:
# main function
def _mean_fold_metrics(model, data, k, x):
    """Call ``model(data, i, k, x)`` for every fold i in [0, k) and average each returned value."""
    per_fold = [model(data, i, k, x) for i in range(k)]
    return tuple(sum(column) / k for column in zip(*per_fold))


def _f1_score(precision, recall):
    """Harmonic mean of precision and recall; 0.0 when both are zero."""
    denominator = precision + recall
    return (2 * precision * recall) / denominator if denominator else 0.0


def main():
    """Run the spam-filter experiments and print two result tables.

    Reads the tab-separated, header-less file "SMSSpamCollection" from the
    working directory, turns its first column into a boolean "is spam" flag,
    and evaluates the multinomial (MN) and multivariate (MV) Naive Bayes
    models with:

    * 5-fold cross validation -> per-class accuracy, overall accuracy and
      average training time;
    * 10-fold cross validation -> precision, recall and the F1 score computed
      from the fold-averaged precision and recall.
    """
    # reading data
    data = pd.read_csv("SMSSpamCollection", sep="\t", header=None)
    # marking all spam sms as true and ham as false
    data.iloc[:, 0] = data.iloc[:, 0] == "spam"

    # (row-name suffix, model function), in the order the tables list them
    models = (("MN", multinomial_naive_bayes), ("MV", multivariate_naive_bayes))

    # 5 fold cross validation: category wise and overall accuracy
    k = 5
    accuracy_table = PrettyTable(["Result", "Value"])
    for position, (tag, model) in enumerate(models):
        if position:
            # blank separator row between the two models
            accuracy_table.add_row([" ", " "])
        accuracy_spam, accuracy_ham, accuracy, avg_time = _mean_fold_metrics(model, data, k, 0)
        accuracy_table.add_row(["Accuracy_SPAM_" + tag, accuracy_spam])
        accuracy_table.add_row(["Accuracy_HAM_" + tag, accuracy_ham])
        accuracy_table.add_row(["Accuracy_" + tag, accuracy])
        accuracy_table.add_row(["AvgTime_" + tag, avg_time])

    print("RESULTS FOR 5 FOLD CROSS VALIDATION :")
    print(accuracy_table)

    # 10 fold cross validation: precision, recall and F1 score
    k = 10
    score_table = PrettyTable(["Result", "Value"])
    for position, (tag, model) in enumerate(models):
        if position:
            score_table.add_row([" ", " "])
        # The average training time is returned too but is not reported here.
        precision, recall, _avg_time = _mean_fold_metrics(model, data, k, 1)
        score_table.add_row(["Precision_" + tag, precision])
        score_table.add_row(["Recall_" + tag, recall])
        score_table.add_row(["F1_Score_" + tag, _f1_score(precision, recall)])

    print("RESULTS FOR 10 FOLD CROSS VALIDATION :")
    print(score_table)

# Run the experiments only when this code is executed directly (as a script
# or a notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

RESULTS FOR 5 FOLD CROSS VALIDATION :
+------------------+--------------------+
|      Result      |       Value        |
+------------------+--------------------+
| Accuracy_SPAM_MN | 0.8785414779439339 |
| Accuracy_HAM_MN  | 0.9964741479599557 |
|   Accuracy_MN    | 0.9807899461400359 |
|    AvgTime_MN    | 0.7919591903686524 |
|                  |                    |
| Accuracy_SPAM_MV | 0.9838122620730223 |
| Accuracy_HAM_MV  | 0.8150038412844476 |
|   Accuracy_MV    |  0.83770197486535  |
|    AvgTime_MV    | 0.7794358253479003 |
+------------------+--------------------+
RESULTS FOR 10 FOLD CROSS VALIDATION :
+--------------+--------------------+
|    Result    |       Value        |
+--------------+--------------------+
| Precision_MN | 0.9766815870417277 |
|  Recall_MN   | 0.8935313426759317 |
| F1_Score_MN  | 0.9332580220884295 |
|              |                    |
| Precision_MV | 0.4714178530548582 |
|  Recall_MV   | 0.9865940998403037 |
| F1_Score_MV  | 0.6379893819934263