In [100]:
# Author: Yilin ZHENG
# import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
import os
import glob
from collections import Counter
from math import log
from functools import reduce

In [19]:
# directories
train_dir = "./train-mails/"
test_dir = "./test-mails/"

# parameters
most = 3000

1. Prepare the data

Ohhhh, it had been done.

2. Create word dictionary

In [20]:
def make_dictionary(train_dir):
    emails = glob.glob(os.path.join(train_dir, "*"))
    all_words = []
    for mail in emails:
        with open(mail) as m:
            for i, line in enumerate(m):
                if i == 2:  # Body of the email
                    words = line.split()
                    all_words += words
    dictionary = Counter(all_words)
    list_to_remove = dictionary.keys()
    for item in list(list_to_remove):
        if item.isalpha() == False:
            del dictionary[item]
        elif len(item) == 1:
            del dictionary[item]
    dictionary = dictionary.most_common(most)
    return dictionary

In [21]:
dictionary = make_dictionary(train_dir)

3. Extract features

In [22]:
def extract_features(mail_dir):
    files = glob.glob(os.path.join(mail_dir, "*"))
    feature_matrix = np.zeros((len(files), most))
    doc_id = 0
    for file in files:
        with open(file) as f:
            for i, line in enumerate(f):
                if i == 2:
                    words = line.split()
                    for word in words:
                        word_id = 0
                        for i, d in enumerate(dictionary):
                            if d[0] == word:
                                word_id = i
                                feature_matrix[doc_id, word_id] = words.count(word)
        doc_id += 1
    return feature_matrix

In [91]:
# get matrix
train_matrix = extract_features(train_dir)
test_matrix = extract_features(test_dir)

In [95]:
# get train labels
train_set = glob.glob(os.path.join(train_dir, "*"))
train_labels = [1 if data.find("spmsg") == -1 else 0 for data in train_set]
# get test labels
test_set = glob.glob(os.path.join(test_dir, "*"))
test_labels = [1 if data.find("spmsg") == -1 else 0 for data in test_set]

4. Naïve Bayes Classifier

Use sklearn Bayes classifier for a comparison

In [None]:
# use sklearn Bayes classifier to compare my own implementation
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import recall_score, f1_score, accuracy_score
GNB = MultinomialNB() #MultinomialNB()
GNB.fit(train_matrix, train_labels)
y_pred = GNB.predict(test_matrix)
accuracy = accuracy_score(test_labels, y_pred)
recall = recall_score(test_labels, y_pred)
f1 = f1_score(test_labels, y_pred)
print("accuracy: {:05.2f}%, recall: {:05.2f}%, F-1: {:05.2f}%" \
      .format(100 * accuracy, 100 * recall, 100 * f1))

My own implementation of Beyes classifier

In [97]:
# define my own multinomial Naïve Bayes classifier
class Naive_Bayes:
    
    def __init__(self):
        pass
    
    def fit(self, train_data, train_labels):
        # train data, labels, and classes
        self.data = train_data
        self.labels = train_labels
        self.feature_indices = [x for x in range(1, len(train_data[0])+1)]
        # count positive and negative instances
        self.pos_count = self.labels.count(1)
        self.neg_count = self.labels.count(0)
        self.total = len(train_labels)
        self.pos = []
        self.neg = []
        for feature, label in zip(self.data, self.labels):
            if label == 1:
                self.pos.append(feature)
            elif label == 0:
                self.neg.append(feature)
            else:
                pass
        # feature
        self.features = {"pos": [], "neg": []}
        # calculate prior
        self.prior_pos = self.pos_count/self.total
        self.prior_neg = self.neg_count/self.total
        # calcuate probabilities of each feature for each class.
        self.features["pos"] = np.sum([[1 if feature[idx-1] > 0 else 0 for idx in self.feature_indices] \
                for feature in self.pos], axis=0)/self.pos_count
        self.features["neg"] = np.sum([[1 if feature[idx-1] > 0 else 0 for idx in self.feature_indices] \
                for feature in self.neg], axis=0)/self.neg_count

    def predict(self, test_data, test_labels):
        self.test_data = test_data
        self.test_labels = test_labels
        self.prediction = []
        for data in self.test_data:
            index_record = []
            for index in self.feature_indices:
                if data[index-1] > 0:
                    index_record.append(index)
            pos_prob = self.prior_pos * \
                    reduce(lambda x, y: x*y, [self.features["pos"][index-1] for index in index_record])
            neg_prob = self.prior_neg * \
                    reduce(lambda x, y: x*y, [self.features["neg"][index-1] for index in index_record])
            if pos_prob >= neg_prob:
                self.prediction.append(1)
            else:
                self.prediction.append(0)
        # calculate metrics
        self.correct_pred = sum([1 if test == pred else 0 for test, pred in \
                                 zip(self.test_labels, self.prediction)])
        self.TP = sum([1 if test == 1 and pred == 1 else 0 for test, pred in \
                       zip(self.test_labels, self.prediction)])
        self.TN = sum([1 if test == 0 and pred == 0 else 0 for test, pred in \
                       zip(self.test_labels, self.prediction)])
        self.FN = sum([1 if test == 1 and pred == 0 else 0 for test, pred in \
                       zip(self.test_labels, self.prediction)])
        self.FP = sum([1 if test == 0 and pred == 1 else 0 for test, pred in \
                       zip(self.test_labels, self.prediction)])
        return self.prediction
    
    def accuracy(self):
        return self.correct_pred / len(self.test_labels)
    
    def recall(self):
        return self.TP / (self.TP + self.FN)
    
    def F1(self):
        return 2 * self.TP / (2 * self.TP + self.FP + self.FN)

In [98]:
model = Naive_Bayes()
model.fit(train_matrix, train_labels)
y_pred2 = model.predict(test_matrix, test_labels)
print("accuracy: {:05.2f}%, recall: {:05.2f}%, F-1: {:05.2f}%" \
      .format(100 * model.accuracy(), 100 * model.recall(), 100 * model.F1()))

accuracy: 84.23%, recall: 99.23%, F-1: 86.29%
