In [22]:
import numpy as np
import re

In [18]:
SPAM_DIR = 'email/spam/'
NONSPAM_DIR = 'email/ham/'

In [19]:
def loadDataset():
    X = []
    Y = []
    
    for i in range(1, 26):
        spam_path = '{0}{1}.txt'.format(SPAM_DIR, i)
        nonspam_path = '{0}{1}.txt'.format(NONSPAM_DIR, i)
        
        with open(spam_path, encoding='gbk') as f:
            spam = f.read()
            spamWords = re.split(r'\W+', spam)
            
        X.append(spamWords)
        Y.append(1)
            
        with open(nonspam_path, encoding='gbk') as f:
            nonspam = f.read()
            nonspamWords = re.split(r'\W+', nonspam)
            
        X.append(nonspamWords)
        Y.append(0)
        
    return X, Y

In [32]:
texts, Y = loadDataset()

In [33]:
def createVocab(X):
    vocab = set()
    
    for x in X:
        vocab = vocab | set(x)
        
    vocab = list(vocab)
    word2Index = dict(zip(vocab, range(len(vocab))))
    
    return vocab, word2Index

In [34]:
_, word2Index = createVocab(texts)

In [45]:
def createDataset(texts, word2Index):
    X = np.zeros((len(texts), len(word2Index)))
    
    for i, words in enumerate(texts):
        for word in words:
            index = word2Index[word]
            X[i, index] = 1
            
    return X

In [46]:
X = createDataset(texts, word2Index)
X.shape

(50, 844)

In [63]:
class NaiveBayes():
    
    def calProb(self, X, Y):
        count = len(X)
        vocabLen = len(X[0])
        
        spamWordsProbs = np.ones((vocabLen,))
        nonspamWordsProbs = np.ones((vocabLen,))
        
        for i in range(count):
            if Y[i] == 1:
                spamWordsProbs += X[i]
            else:
                nonspamWordsProbs += X[i]
                
        spamProbs = np.sum(Y) / count
        
        return spamWordsProbs, nonspamWordsProbs, spamProbs
    
    def fit(self, X, Y):
        self._spamWordsProbs, self._nonspamWordsProbs, self._spamProbs = self.calProb(X, Y)
        
    def predict(self, X):
        Y = []
        
        for x in X:
            spam = np.sum(x * np.log(self._spamWordsProbs / np.sum(self._spamWordsProbs))) + np.log(self._spamProbs)
            nonspam = np.sum(x * np.log(self._nonspamWordsProbs / np.sum(self._nonspamWordsProbs))) \
                        + np.log(1 - self._spamProbs)
            y = 1 if spam > nonspam else 0
            
            Y.append(y)
            
        return np.array(Y)
    
    def accuracy(self, Y, YPredict):
        return np.sum(Y == YPredict) / len(Y)

In [64]:
bayes = NaiveBayes()

In [65]:
bayes.fit(X, Y)

In [66]:
bayes.predict(X)

array([1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0])

In [82]:
XTrain = X[:40]
YTrain = Y[:40]

XTest = X[40:]
YTest = Y[40:]

In [83]:
bayes = NaiveBayes()
bayes.fit(XTrain, YTrain)

In [84]:
YTestPredict = bayes.predict(XTest)

In [85]:
bayes.accuracy(YTest, YTestPredict)

1.0