In [339]:
import math
import re
import string
from random import randrange

In [340]:
# Model state shared between the training and prediction functions.
# The two dictionaries map a word to its smoothed per-class probability.
probwordinspam, probwordinnonspam = {}, {}
# Number of training emails seen per class.
numspam = numnonspam = 0
# Class priors, filled in by training.
prob_spam = prob_non_spam = 0
# Sorted vocabulary of the current training set.
unique_words = []

In [341]:
#returns P(word|Spam)
def prob_word_in_spam(word,data):
    """Estimate P(word | spam) from labelled emails.

    word: the token to look for.
    data: iterable of (words, label) pairs; an email counts as spam when its
          label is the string '1'.

    Returns the Laplace-smoothed fraction of spam emails containing `word`.
    The denominator uses the module-level `numspam` counter, not a count
    derived from `data`.
    """
    spam_hits = sum(1 for email in data if email[1] == '1' and word in email[0])
    #Laplace smoothing: one extra "present" and one extra "absent" observation
    return (spam_hits+1)/(numspam+2)

In [342]:
#returns P(word|NonSpam)
def prob_word_in_nonspam(word,data):
    """Estimate P(word | non-spam) from labelled emails.

    word: the token to look for.
    data: iterable of (words, label) pairs; an email counts as non-spam when
          its label is the string '0'.

    Returns the Laplace-smoothed fraction of non-spam emails containing
    `word`. The denominator uses the module-level `numnonspam` counter, not a
    count derived from `data`.
    """
    ham_hits = sum(1 for email in data if email[1] == '0' and word in email[0])
    #Laplace smoothing: one extra "present" and one extra "absent" observation
    return (ham_hits+1)/(numnonspam+2)

In [343]:
#Splits the data received into k folds and returns the splits
def k_fold_split(data, nfolds):
    """Randomly partition `data` into `nfolds` folds.

    data: list of samples; it is copied, so the caller's list is not modified.
    nfolds: number of folds to create.

    Returns a list of `nfolds` lists. Every fold first receives
    int(len(data)/nfolds) randomly drawn samples; the samples left over are
    then handed out one each to the leading folds.
    """
    pool = data[:]
    fold_size = int(len(data)/nfolds)
    #Draw without replacement: every pop removes the chosen sample from the pool
    folds = [
        [pool.pop(randrange(len(pool))) for _ in range(fold_size)]
        for _ in range(nfolds)
    ]
    #The len(data) % nfolds leftover samples go to the first folds, one each
    for fold in folds[:len(data) % nfolds]:
        fold.append(pool.pop(randrange(len(pool))))
    return folds

In [344]:
#Given a list of emails, extracts out the unique words and returns them as a list
def get_unique_words(data):
    """Return the sorted list of distinct words across all emails.

    data: iterable of (words, label) pairs; only the words (index 0) are read.
    """
    vocabulary = {token for email in data for token in email[0]}
    return sorted(vocabulary)

In [345]:
def k_fold_train_test(data,nfolds):
    """Run k-fold cross-validation of the classifier and print the accuracies.

    data: list of (words, label) pairs.
    nfolds: number of folds.

    Each fold is held out once as the test set while the classifier is
    trained on the remaining folds. The per-fold accuracy and the average
    accuracy are printed; nothing is returned. The module-level model state
    is cleared after every fold.
    """
    global probwordinspam ,probwordinnonspam,numspam,numnonspam,prob_spam,prob_non_spam,unique_words

    folds = k_fold_split(data,nfolds)
    fold_scores = []

    #Each fold takes one turn as the held-out test set
    for fold_no, held_out in enumerate(folds, start=1):
        training_folds = folds[:]
        training_folds.remove(held_out)
        trainset = [sample for fold in training_folds for sample in fold]
        testset = held_out[:]

        #Train on the remaining folds, predict the held-out one, then score it
        spam_classifier_fit(trainset)
        predicted = spam_classifier_predict(testset)
        accuracy = check_accuracy(testset,predicted)

        print(f"Accuracy of the model over fold no. {fold_no} = {accuracy}")
        fold_scores.append(accuracy)

        #Put the global model state back to its initial values before the next fold
        probwordinspam.clear()
        probwordinnonspam.clear()
        unique_words.clear()
        numspam = 0
        numnonspam = 0
        prob_spam = 0
        prob_non_spam = 0

    #Prints the average accuracy
    print(f"\nAverage accuracy of the model = {sum(fold_scores)/len(fold_scores)}")

In [346]:
#Training the Naive Bayes Classifier
def spam_classifier_fit(data):
    """Fit the Naive Bayes model on `data` and store it in the module globals.

    data: list of (words, label) pairs; label '1' means spam, any other label
          is counted as non-spam.

    Sets `unique_words`, `numspam`, `numnonspam`, `probwordinspam`,
    `probwordinnonspam`, `prob_spam` and `prob_non_spam`.

    Any state left over from an earlier fit is discarded first, so calling
    this function repeatedly always yields a model of the latest `data` only
    (previously the counters kept accumulating and stale words stayed in the
    probability tables unless the caller reset the globals by hand).

    Raises ZeroDivisionError if `data` is empty.
    """
    global numspam, numnonspam,probwordinspam,probwordinnonspam, prob_spam, prob_non_spam, unique_words

    #Start from a clean slate so that repeated fits do not double-count
    numspam = 0
    numnonspam = 0
    probwordinspam.clear()
    probwordinnonspam.clear()

    unique_words = get_unique_words(data)

    #Finds the number of spam and non-spam emails in the given training dataset
    for line in data:
        if line[1] == '1':
            numspam += 1
        else:
            numnonspam += 1

    #Finds P(Word|Spam), P(Word|NonSpam) and stores them in dictionary
    #(the helpers read numspam / numnonspam, so the counts must be final here)
    for word in unique_words:
        probwordinspam[word] = prob_word_in_spam(word,data)
        probwordinnonspam[word] = prob_word_in_nonspam(word,data)

    #Finds P(Spam) and P(NonSpam)
    total = numspam + numnonspam
    prob_spam = numspam/total
    prob_non_spam = numnonspam/total

In [347]:
#Log of a probability; a probability of zero maps to -infinity instead of raising
def _safe_log(probability):
    return math.log(probability) if probability > 0 else float('-inf')


#Predicting values based on the model trained above
def spam_classifier_predict(data):
    """Classify every email in `data` with the fitted Naive Bayes model.

    data: iterable of (words, label) pairs; only the words (index 0) are read.

    Returns a list with one int per email: 1 for spam, 0 for non-spam.

    The two class scores are compared as sums of log-probabilities. Multiplying
    the raw probabilities underflows to 0.0 for both classes once an email has
    enough known words, which made the former spam/(spam+nonspam) ratio raise
    ZeroDivisionError. Words outside `unique_words` are ignored. A tie is
    classified as spam, matching the former ">= 0.5" rule.
    """
    #Set membership is constant-time; testing against the list was a linear scan per word
    vocabulary = set(unique_words)
    log_prior_spam = _safe_log(prob_spam)
    log_prior_nonspam = _safe_log(prob_non_spam)

    predictions = []
    for line in data:
        log_spam = log_prior_spam
        log_nonspam = log_prior_nonspam
        for word in line[0]:
            if word in vocabulary:
                #Adds log P(word|Spam) and log P(word|NonSpam) to the class scores
                log_spam += _safe_log(probwordinspam[word])
                log_nonspam += _safe_log(probwordinnonspam[word])
        #P(Spam|words) < 0.5 is equivalent to the spam score being the smaller one
        predictions.append(0 if log_spam < log_nonspam else 1)
    return predictions

In [348]:
def check_accuracy(testset,predicted):
    """Return the fraction of emails whose predicted label matches the true one.

    testset: sequence of (words, label) pairs; the label is converted with int().
    predicted: sequence of int predictions, indexed in step with `testset`.

    Raises ZeroDivisionError if `testset` is empty.
    """
    total = len(testset)
    hits = sum(
        1
        for position, sample in enumerate(testset)
        if predicted[position] == int(sample[1])
    )
    return hits/total

In [349]:
with open("dataset_NB.txt", "r") as f:
    lines = f.read().splitlines()

#Splits every line into (text, label): the last character is taken as the label,
#everything before it as the text
lines_label = [(entry[0:-1], entry[-1:]) for entry in lines]

In [350]:
#Splits the emails line by line into words and removes any punctuation marks, brackets, numbers etc.
#re.escape is needed here: inside [...] the raw string.punctuation contains "\]",
#which the regex engine reads as an escaped "]", so backslashes were never stripped
PUNCTUATION_RE = re.compile('[' + re.escape(string.punctuation) + ']')
#Digits and stray control whitespace are deleted outright (not replaced by a space)
DIGITS_AND_CONTROL_RE = re.compile('[\n\t\r0-9]')

linewords_label = []
for line in lines_label:
    txt = line[0]
    #Do not call this variable `str`: that would shadow the builtin for the rest of the notebook
    cleaned = PUNCTUATION_RE.sub(' ', txt)
    cleaned = DIGITS_AND_CONTROL_RE.sub('', cleaned)
    words = cleaned.split()
    linewords_label.append((words, line[1]))

In [351]:
#Hardcoded list of stopwords which may not be important to understand the context or sentiment of the email
stopwords = [
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
    'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves',
    'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself',
    'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
    'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those',
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
    'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while',
    'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through',
    'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
    'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once',
    'here', 'there', 'when', 'where', 'why', 'how',
    'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such',
    'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
    's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now',
    'd', 'll', 'm', 'o', 're', 've', 'y',
    'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't",
    'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't",
    'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't",
    'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't",
    'won', "won't", 'wouldn', "wouldn't",
]

#Final list of data with words and labels
#A set gives constant-time membership tests; the stopwords list itself is left untouched
stopword_set = set(stopwords)
data = []
for line in linewords_label:
    txt = line[0]
    #Do not call this list `str`: that would shadow the builtin for the rest of the notebook
    kept_words = []
    for word in txt:
        #Lower-case BEFORE the stopword test, otherwise capitalised stopwords
        #("The", "And", "You") slip through and end up in the data as "the", "and", "you"
        lowered = word.lower()
        #Removes the stopwords, words of length <=2 from the list of words
        if lowered not in stopword_set and len(word) > 2:
            kept_words.append(lowered)
    data.append((kept_words, line[1]))

In [352]:
#Runs 7-fold cross-validation of the Naive Bayes classifier on the prepared data
k_fold_train_test(data, nfolds=7)

Accuracy of the model over fold no. 1 = 0.8391608391608392
Accuracy of the model over fold no. 2 = 0.8251748251748252
Accuracy of the model over fold no. 3 = 0.8251748251748252
Accuracy of the model over fold no. 4 = 0.8111888111888111
Accuracy of the model over fold no. 5 = 0.7902097902097902
Accuracy of the model over fold no. 6 = 0.8251748251748252
Accuracy of the model over fold no. 7 = 0.795774647887324

Average accuracy of the model = 0.8159797948530343
