In [1]:
import numpy as np

## 词转化为向量

In [2]:
# Build the vocabulary: every distinct word that occurs in any document
def createVocabList(dataSet):
    """Return a list of the unique words found across all documents.

    dataSet is an iterable of documents, each of which is an iterable of
    hashable words. The order of the returned list is whatever order the
    underlying set yields, so callers should not rely on it.
    """
    # One multi-way union instead of folding the documents in one at a time
    return list(set().union(*dataSet))

# Set-of-words model: flag which vocabulary words occur in inputSet
def setOfWords2Vec(vocabList, inputSet):
    """Encode a document as a 0/1 presence vector over the vocabulary.

    Parameters:
        vocabList: list of vocabulary words; position i of the result
            corresponds to vocabList[i].
        inputSet: iterable of words making up the document.

    Returns:
        A list of len(vocabList) ints, 1 where the vocabulary word appears
        in inputSet at least once and 0 elsewhere.

    Words of inputSet that are not in vocabList are reported with print()
    and otherwise ignored.
    """
    # Build a word -> position table once so each lookup is O(1) instead of
    # scanning vocabList twice per word. setdefault keeps the FIRST position
    # of a repeated vocabulary word, matching list.index().
    positionOf = {}
    for position, vocabWord in enumerate(vocabList):
        positionOf.setdefault(vocabWord, position)

    returnVec = [0] * len(vocabList)    # zero vector as long as the vocabulary
    for word in inputSet:
        position = positionOf.get(word)
        if position is not None:
            returnVec[position] = 1
        else:
            print('the word: %s is not in my Vocabulary!' % word)
    return returnVec

# Bag-of-words model: count how many times each vocabulary word occurs in inputSet
def bagOfWords2VecMN(vocabList, inputSet):
    """Encode a document as a vector of word counts over the vocabulary.

    Parameters:
        vocabList: list of vocabulary words; position i of the result
            corresponds to vocabList[i].
        inputSet: iterable of words making up the document (repeats count).

    Returns:
        A list of len(vocabList) ints holding, for each vocabulary word,
        the number of times it occurs in inputSet.

    Words of inputSet that are not in vocabList are reported with print()
    and otherwise ignored.
    """
    # Build a word -> position table once so each lookup is O(1) instead of
    # scanning vocabList twice per word. setdefault keeps the FIRST position
    # of a repeated vocabulary word, matching list.index().
    positionOf = {}
    for position, vocabWord in enumerate(vocabList):
        positionOf.setdefault(vocabWord, position)

    returnVec = [0] * len(vocabList)    # zero vector as long as the vocabulary
    for word in inputSet:
        position = positionOf.get(word)
        if position is not None:
            returnVec[position] += 1
        else:
            print('the word: %s is not in my Vocabulary!' % word)
    return returnVec

## 文件解析

In [3]:
def textParse(bigString):
    """Split raw text into lowercase tokens longer than two characters.

    Parameters:
        bigString: the text to tokenize.

    Returns:
        A list of the lower-cased runs of word characters in bigString,
        keeping only those with more than two characters.
    """
    import re
    # Use r'\W+' rather than r'\W*': a pattern that can match the empty string
    # makes re.split cut between every character on Python 3.7+, which leaves
    # only one-character pieces and therefore an empty token list.
    listOfTokens = re.split(r'\W+', bigString)
    # Drop very short fragments and normalize case
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

## 朴素贝叶斯分类器训练函数
伪代码如下：  
计算每个类别中文档数目  
对每篇训练文档：  
&ensp;&ensp;&ensp;&ensp;对每个类别：  
&ensp;&ensp;&ensp;&ensp;&ensp;&ensp;&ensp;&ensp;如果词条出现在文档中——>增加该词条计数值  
&ensp;&ensp;&ensp;&ensp;&ensp;&ensp;&ensp;&ensp;增加所有词条计数值  
对每个类别：  
&ensp;&ensp;&ensp;&ensp;对每个词条：  
&ensp;&ensp;&ensp;&ensp;&ensp;&ensp;&ensp;&ensp;将该词条的数目除以总词条的数目得到条件概率  
返回每个类别的条件概率

In [4]:
def trainNB0(trainMatrix, trainCategory):
    """Estimate naive Bayes parameters from labelled word vectors.

    Parameters:
        trainMatrix: sequence of word vectors, one per training document,
            all of the same length.
        trainCategory: sequence of labels aligned with trainMatrix; a label
            equal to 1 selects class 1, anything else selects class 0.

    Returns:
        (p0Vect, p1Vect, pAbusive): the per-word log conditional
        probabilities for class 0 and class 1, and the fraction
        sum(trainCategory) / number of documents.
    """
    docCount = len(trainMatrix)
    vocabSize = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(docCount)    # prior of class 1

    # Smoothing so no probability is ever zero: every word count starts at 1
    # and every class total starts at 2.0. Index 0 / 1 is the class.
    wordCounts = [np.ones(vocabSize), np.ones(vocabSize)]
    classTotals = [2.0, 2.0]

    for docIndex, wordVector in enumerate(trainMatrix):
        cls = 1 if trainCategory[docIndex] == 1 else 0
        wordCounts[cls] += wordVector            # per-word counts for this class
        classTotals[cls] += sum(wordVector)      # total words seen in this class

    # Work in log space so later products of small probabilities do not underflow
    p0Vect = np.log(wordCounts[0] / classTotals[0])
    p1Vect = np.log(wordCounts[1] / classTotals[1])
    return p0Vect, p1Vect, pAbusive

## 朴素贝叶斯分类函数

In [5]:
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Pick the more likely class for a word vector.

    Parameters:
        vec2Classify: word vector of the document (numpy array).
        p0Vec, p1Vec: per-word log conditional probabilities for class 0
            and class 1.
        pClass1: prior probability of class 1.

    Returns:
        1 if the class-1 log score is strictly greater than the class-0
        log score, otherwise 0 (ties go to class 0).
    """
    # In log space the product of independent word probabilities becomes a sum
    logScoreClass1 = sum(vec2Classify * p1Vec) + np.log(pClass1)
    logScoreClass0 = sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    return 1 if logScoreClass1 > logScoreClass0 else 0

## 垃圾邮件测试函数

In [6]:
def spamTest():
    """Run one randomized hold-out evaluation of the spam classifier.

    Reads email/spam/1.txt .. 25.txt and email/ham/1.txt .. 25.txt, holds
    out 10 randomly chosen messages as the test set, trains on the other 40
    with set-of-words vectors, then classifies the held-out messages.

    Returns:
        The fraction of held-out messages that were misclassified. The same
        value is also printed.
    """
    docList = []
    classList = []
    # Load and tokenize the messages, interleaved as spam i, ham i.
    # Label 1 = spam, 0 = ham.
    for i in range(1, 26):
        for pathPattern, label in (('email/spam/%d.txt', 1), ('email/ham/%d.txt', 0)):
            # 'with' closes each file promptly instead of leaking the handle
            with open(pathPattern % i) as emailFile:
                docList.append(textParse(emailFile.read()))
            classList.append(label)

    vocabList = createVocabList(docList)    # vocabulary over all messages

    # Randomly move 10 document indices from the training set to the test set
    trainingSet = list(range(50))
    testSet = []
    for i in range(10):
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])

    # Build the word vectors and labels for the training set
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])

    # Estimate the class-conditional log probabilities and the spam prior
    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses))

    # Classify every held-out message and count the mistakes
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1

    errorRate = float(errorCount)/len(testSet)
    print('the error rate is: ', errorRate)
    return errorRate

In [7]:
# Repeat the randomized hold-out test and report the mean accuracy
numRuns = 20
error = 0.0
for i in range(numRuns):
    error += spamTest()
mean = error/numRuns
# round() before %d: %d truncates toward zero, so a value such as 95.99999
# would otherwise be reported as 95%
print('the accuracy rate of the model is: %d%%' % round((1-mean)*100))

  return _compile(pattern, flags).split(string, maxsplit)


the error rate is:  0.0
the error rate is:  0.1
the error rate is:  0.0
the error rate is:  0.3
the error rate is:  0.0
the error rate is:  0.0
the error rate is:  0.0
the error rate is:  0.0
the error rate is:  0.1
the error rate is:  0.0
the error rate is:  0.2
the error rate is:  0.0
the error rate is:  0.0
the error rate is:  0.0
the error rate is:  0.0
the error rate is:  0.0
the error rate is:  0.0
the error rate is:  0.0
the error rate is:  0.0
the error rate is:  0.1
the accuracy rate of the model is: 96%
