In [1]:
# Imports
import re
import math
from nltk.stem import WordNetLemmatizer # pip install nltk
from nltk.corpus import stopwords
from collections import Counter

In [6]:
# Read and process data
def process_data(text):
    no_punct_string = re.sub(r'[^\w\s]', '', text)
    return no_punct_string.lower()
    
with open('dataset', 'r') as infine:
    lines = infine.readlines()
    lines = [line.rstrip() for line in lines]
    labels = []
    data = []
    for line in lines:
        line_contents = line.split('\t')
        labels.append(line_contents[0])
        data.append(process_data(line_contents[1]))
print('Unique labels are', set(labels))
print('Len of data is', len(data))

IndexError: list index out of range

In [None]:
# Class for Multinomial Naive Bayes 
class MultiNomialNB():
    def __init__(self, spam_data, ham_data):
        self.cnt_spam, self.cnt_ham = self.build_counts(spam_data, ham_data)
        self.spam_words_count = self.get_count(self.cnt_spam)  # Total number of features in the spam class
        self.ham_words_count = self.get_count(self.cnt_ham) # Total number of features in the ham class
        self.vocab_count = len(self.cnt_spam) + len(self.cnt_ham) # Total number of words accross all documents
        self.spam_count = len(spam_data) # Total number of spam documents
        self.ham_count = len(ham_data)
        self.doc_count = self.spam_count + self.ham_count # Total number of all features

    def get_count(self, count_dict):
        """
        Gives the total number of words for a given class
        """
        tot_count = 0
        for key, value in count_dict.items():
            tot_count += value
        return tot_count

    def build_counts(self, spam_data, ham_data):
        """
        Builds the feature dictionary for every class
        """
        wordnet_lemmatizer = WordNetLemmatizer()
        stops = set(stopwords.words('english'))
        cnt_spam = Counter()
        cnt_ham = Counter()
        for spam_texts in spam_data:
            spam_words = [wordnet_lemmatizer.lemmatize(word) for word in spam_texts.split(' ') if word not in stops and word is not '']
            for word in spam_words:
                cnt_spam[word] += 1
        for ham_texts in ham_data:
            ham_words = [wordnet_lemmatizer.lemmatize(word) for word in ham_texts.split(' ') if word not in stops and word is not '']
            for word in ham_words:
                cnt_ham[word] += 1
        return cnt_spam, cnt_ham
    def train(self):
        """
        Creates the feature vector for each class consisting of word -> smoothed proability of occurence
        Also, computes the apriori probabilities for each class
        """
        self.features = {}
        self.features['spam_features'] = {}
        self.features['ham_features'] = {}

        # Setting the a priori class probablities
        self.priorLogSpam = math.log(self.spam_count/self.doc_count)
        self.priorLogHam = math.log(self.ham_count/self.doc_count)

        # Probablity of each feature in each class
        for word, count in self.cnt_spam.items():
            self.features['spam_features'][word] = math.log((count+1)/(self.spam_words_count+self.vocab_count))
        for word, count in self.cnt_ham.items():
            self.features['ham_features'][word] = math.log((count+1)/(self.ham_words_count+self.vocab_count))

    def test(self, document):
        """
        Takes a document and predicts whether spam or ham
        """
        wordnet_lemmatizer = WordNetLemmatizer()
        stops = set(stopwords.words('english'))
        document = [wordnet_lemmatizer.lemmatize(x) for x in document.split(" ") if x not in stops and x is not '']
        spam_val = self.priorLogSpam
        ham_val = self.priorLogHam

        # Initializing the smooth probabilites
        smooth_spam = math.log(1/(self.spam_words_count+self.doc_count))
        smooth_ham = math.log(1/(self.ham_words_count+self.doc_count))

        # Updating the scores for each class
        # Spam Class
        for word in document:
            if word in self.features['spam_features']:
                spam_val += self.features['spam_features'][word]
            elif word in self.features['ham_features']:
                spam_val += smooth_spam
        # Ham Class
        for word in document:
            if word in self.features['ham_features']:
                ham_val += self.features['ham_features'][word]
            elif word in self.features['spam_features']:
                ham_val += smooth_ham
        if spam_val >= ham_val:
            return ('spam', spam_val)
        else:
            return ('ham', ham_val)

In [None]:
#TODO: Implement k-fold cross-validation code
kfold = 5
fold_size = len(data)//kfold
fold_indices = []
for fold in range(kfold):
    fold_indices.append(fold*fold_size)
fold_indices.append(len(data))
print('K-Fold indices', fold_indices)

def get_kfold_data(fold):
    test_range = fold_indices[fold], fold_indices[fold+1]
    test_data, test_labels = data[test_range[0]:test_range[1]], labels[test_range[0]:test_range[1]]
    train_data, train_labels = [], []
    for i in range(kfold):
        if fold == i:
            continue
        else:
            train_range = fold_indices[i], fold_indices[i+1]
            train_data.extend(data[train_range[0]:train_range[1]])
            train_labels.extend(labels[train_range[0]:train_range[1]])
    assert len(train_data) + len(test_data) == len(data)
    return (train_data, train_labels), (test_data, test_labels)

def get_train_ham_spam(train_data, train_labels):
    # Splitting data into spam and ham
    train_spam_data = []
    train_ham_data = []
    for i, text in enumerate(train_data):
        if train_labels[i] == 'spam':
            train_spam_data.append(text)
        elif train_labels[i] == 'ham':
            train_ham_data.append(text)
        else:
            print('Labels not extracted properly')
    assert len(train_spam_data) + len(train_ham_data) == len(train_data)
    return train_spam_data, train_ham_data

In [3]:
# Auxilliary Functions
# Getting predictions on the test set
def get_predictions(test_data, multinb_classifer):
    pred_labels = []
    pred_scores = []
    for test_doc in test_data:
        pred_label, pred_score = multinb_classifer.test(test_doc)
        pred_labels.append(pred_label)
        pred_scores.append(pred_score)
    return pred_labels, pred_scores

def get_accuracy(pred_labels, test_labels):
    # Calculating accuracy
    correct = 0
    wrong_indices = []
    for i, pred_label in enumerate(pred_labels):
        if pred_label == test_labels[i]:
            correct += 1
        else:
            wrong_indices.append(i)
    accuracy = (correct/len(pred_labels))*100
    return accuracy

def print_classification_matrix(pred_labels, test_labels):
    orig_spam = 0
    orig_ham = 0
    for label in test_labels:
        if label == 'spam':
            orig_spam += 1
        elif label == 'ham':
            orig_ham += 1
    spam_as_ham = 0
    ham_as_spam = 0
    for i, pred_label in enumerate(pred_labels):
        if pred_label != test_labels[i]:
            if test_labels[i] == 'spam':
                spam_as_ham += 1
            elif test_labels[i] == 'ham':
                ham_as_spam += 1
    spam_as_spam = orig_spam - spam_as_ham
    ham_as_ham = orig_ham - ham_as_spam
    print("Classification Matrix\n")
    print("\tSpam\tHam")
    print("Spam\t", spam_as_spam, "\t", spam_as_ham)
    print("Ham\t", ham_as_spam, "\t", ham_as_ham)

In [4]:
# Implementing k-fold
total_pred_labels = []
total_test_labels = []
for fold in range(kfold):
    (train_data, train_labels), (test_data, test_labels) = get_kfold_data(fold)
    train_spam_data, train_ham_data = get_train_ham_spam(train_data, train_labels)
    multinb_classifer = MultiNomialNB(train_spam_data, train_ham_data)
    multinb_classifer.train() # Training  
    pred_labels, pred_scores = get_predictions(test_data, multinb_classifer) # Predicting on the test set
    total_pred_labels.extend(pred_labels)
    total_test_labels.extend(test_labels)
# Calcuating the accuracy 
accuracy = get_accuracy(total_pred_labels, total_test_labels)
print_classification_matrix(total_pred_labels, total_test_labels)
print('Accuracy of k-Fold cross-validated Multinomial Naive Bayes is', accuracy)

NameError: name 'kfold' is not defined