# 2. 使用朴素贝叶斯过滤垃圾邮件

In [1]:
def createVocabList(dataset):
    """Return a list of the distinct words that occur across all documents.

    Args:
        dataset: iterable of documents, each an iterable of hashable tokens.

    Returns:
        A list holding every distinct token exactly once. The order is the
        iteration order of a set and therefore not meaningful.
    """
    # Union of all per-document token sets, computed in one call.
    return list(set().union(*dataset))

def setOfWords2Vec(vocabList, inputSet):
    """Convert a document into a set-of-words (presence/absence) vector.

    Args:
        vocabList: list of vocabulary words; position i of the result
            corresponds to vocabList[i].
        inputSet: iterable of words making up one document.

    Returns:
        A list of len(vocabList) integers. An entry is 1 if the matching
        vocabulary word appears in the document at least once, else 0.
        This is a set model: repeated occurrences are NOT counted.

    Words that are not in the vocabulary are reported on stdout and
    otherwise ignored.
    """
    # Map each word to the index of its FIRST occurrence, which is what
    # list.index() would return, so each lookup is O(1) instead of O(V).
    wordIndex = {}
    for position, vocabWord in enumerate(vocabList):
        wordIndex.setdefault(vocabWord, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in wordIndex:
            returnVec[wordIndex[word]] = 1
        else:
            print("the word: %s is not in my Vocabulary!" % word)
    return returnVec

from numpy import *
def trainNB0(trainMatrix, trainCategory):
    """Estimate naive Bayes parameters for a two-class problem.

    Args:
        trainMatrix: non-empty sequence of word vectors, one per training
            document, all of the same length.
        trainCategory: sequence of class labels aligned with trainMatrix;
            a label equal to 1 is class 1, anything else is class 0.

    Returns:
        A tuple (p0Vect, p1Vect, pAbusive):
        p0Vect: array of log conditional word probabilities for class 0.
        p1Vect: array of log conditional word probabilities for class 1.
        pAbusive: sum of the labels divided by the number of documents,
            i.e. the class-1 prior when labels are 0/1.
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    # Laplace smoothing: word counts start at 1 and denominators at 2 so
    # that no conditional probability is ever exactly zero.
    p0Num = ones(numWords)
    p1Num = ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    # enumerate() replaces the Python-2-only xrange() index loop.
    for i, docVector in enumerate(trainMatrix):
        if trainCategory[i] == 1:
            p1Num += docVector
            p1Denom += sum(docVector)
        else:
            p0Num += docVector
            p0Denom += sum(docVector)
    # Work in log space so that multiplying many small probabilities later
    # becomes a sum and does not underflow.
    p1Vect = log(p1Num / p1Denom)
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Pick the more likely of two classes for a word vector.

    Args:
        vec2Classify: word vector of the document to classify.
        p0Vec: per-word log probabilities for class 0.
        p1Vec: per-word log probabilities for class 1.
        pClass1: prior probability of class 1.

    Returns:
        1 if the class-1 log score is strictly greater than the class-0
        log score, otherwise 0 (ties go to class 0).
    """
    # Log score = sum of the selected word log-probabilities + log prior.
    scoreClass1 = sum(vec2Classify * p1Vec) + log(pClass1)
    scoreClass0 = sum(vec2Classify * p0Vec) + log(1 - pClass1)
    return 1 if scoreClass1 > scoreClass0 else 0

In [2]:
def textParse(bigString):
    """Split raw text into lowercase word tokens longer than two characters.

    Args:
        bigString: the text to tokenize.

    Returns:
        A list of lowercased tokens, in order of appearance, keeping only
        those with more than two characters.
    """
    import re
    # Split on runs of one or more non-word characters. The earlier pattern
    # r'\W*' can match the empty string; on Python 3.7+ that splits between
    # every character, so no token survives the length filter below.
    listOfTokens = re.split(r'\W+', bigString)
    # Drop short tokens (including the empty strings split() yields at the
    # edges) and normalize case.
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

In [3]:
def spamTest():
    """Run one hold-out evaluation of the naive Bayes spam classifier.

    Reads ./email/spam/1.txt .. 25.txt (label 1) and ./email/ham/1.txt ..
    25.txt (label 0), randomly holds out 10 documents as a test set,
    trains on the rest, and classifies the held-out documents.

    Returns:
        The fraction of held-out documents that were misclassified. The
        same value is also printed.
    """
    docList = []
    classList = []
    sources = (('./email/spam/%d.txt', 1), ('./email/ham/%d.txt', 0))
    for i in range(1, 26):
        for pathTemplate, label in sources:
            # Context manager so every file handle is closed promptly.
            with open(pathTemplate % i) as emailFile:
                docList.append(textParse(emailFile.read()))
            classList.append(label)
    # Build the vocabulary from every document.
    vocabList = createVocabList(docList)
    # A real list is needed because entries are deleted below; a Python 3
    # range object does not support item deletion.
    trainingSet = list(range(len(docList)))
    testSet = []
    # Move 10 randomly chosen documents to the test set; the rest train.
    for _ in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(trainMat, trainClasses)
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(wordVector, p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    error_rate = float(errorCount) / len(testSet)
    print("the error rate is: ", error_rate)
    return error_rate

In [4]:
# Repeat the random hold-out experiment 10 times and report the mean error
# rate. range() is used instead of the Python-2-only xrange().
error = 0
for i in range(10):
    error += spamTest()
print("Sum, the error rate is: ", error/10)

('the error rate is: ', 0.0)
('the error rate is: ', 0.0)
('the error rate is: ', 0.1)
('the error rate is: ', 0.0)
('the error rate is: ', 0.0)
('the error rate is: ', 0.0)
('the error rate is: ', 0.1)
('the error rate is: ', 0.1)
('the error rate is: ', 0.1)
('the error rate is: ', 0.1)
('Sum, the error rate is: ', 0.05)
