#   4.基于概率论的分类方法：朴素贝叶斯
###  优点：在数据较少的情况下仍然有效，可以处理多类别问题。
###    缺点：对于输入数据的准备方式比较敏感。
###    适用数据类型：标称型数据（有限）。
###    注释：“朴素”，这里指整个形式化过程只做最原始、最简单的假设。¶

# 4.1使用Python进行文本分类

## 4.1.1 准备数据：从文本中构建词向量

### 4-1 词表到向量的转化函数

In [1]:
def loadDataSet():
    postingList = [['my','dog','has','flea','problems','help','please'],
                   ['maybe','not','take','him','to','dog','park','stupid'],
                   ['my','dalmation','is','so','cute','I','love','him'],
                   ['stop','posting','stupid','worthless','garbage'],
                   ['mr','licks','ate','my','steak','how','to','stop','him'],
                   ['quit','buying','worthless','dog','food','stupid']]  # 定义邮件列表
    classVec = [0,1,0,1,0,1]  # 1 代表侮辱性文字， 0 代表正常言论
    return postingList, classVec

def createVocabList(dataSet):  # 创建词汇列表
    vocabSet = set([])  # 定义词汇集
    for document in dataSet:  # 遍历文档
        vocabSet = vocabSet | set(document)  # 将每个document合并到vocabSet，|用来联合两个集合
    return list(vocabSet)  
    
def setOfWords2Vec(vocabList, inputSet):  # 把单词转换成向量
    returnVec = [0]*len(vocabList)  # 定义要返回的向量
    for word in inputSet:   # 遍历输出集中的单词
        if word in vocabList:   # 单词在词汇集中
            returnVec[vocabList.index(word)] = 1   # 对应的位置设为1
        else:
            print("the word: %s is not in my Vocabulary!" % word)
    return returnVec

In [2]:
def bagOfWords2VecMN(vocabList, inputSet):   # 把单词转换成向量,用词袋模型，计算词出现的次数
    returnVec = [0]*len(vocabList)   # 定义要返回的向量
    for word in inputSet:  # 遍历输出集中的单词
        if word in vocabList:  # 单词在词汇集中
            returnVec[vocabList.index(word)] += 1    #对应的词出现次数 加1
    return returnVec

## 4.1.2 训练算法：从此向量计算概率
### 4-2 朴素贝叶斯分类器训练函数

In [3]:
def trainNB0(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)   # 计算文档的数目
    numWords = len(trainMatrix[0])   # 计算单词的数目
    pAbusive = sum(trainCategory)/float(numTrainDocs)   # 计算类别的概率，abusive为1，not abusive为0
    p0Num = zeros(numWords)   # 初始化计数器，p0是not abusive
    p1Num = zeros(numWords) # 初始化计数器
    p0Denom = 0.0  # 初始化分母
    p1Denom = 0.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:   # 计算abusive对应的词汇的数目，trainMatrix为0-1值形成的向量
            p1Num += trainMatrix[i]    # p1Num存储的是每个词出现的次数
            p1Denom += sum(trainMatrix[i])   # p1Denom存储的是词的总数目
        else:
            p0Num += trainMatrix[i]   # 每个词在not abusive下出现的次数
            p0Denom += sum(trainMatrix[i])   # not abusive下的总词数
    p1Vect = p1Num/p1Denom   # change to log()   # 计算abusive下每个词出现的概率
    p0Vect = p0Num/p0Denom   # change to log()   # 计算not abusive下每个词出现的概率
    return p0Vect,p1Vect,pAbusive

## 4.1.3 测试算法：根据现实情况修改分类器
### 4-3 朴素贝叶斯分类函数

In [4]:
from numpy import * 
def classifyNB(vec2Classify, p0Vec,p1Vec,pClass1):
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)  # log()默认以e为底     ### 计算abusive的概率
    p0 = sum(vec2Classify * p0Vec) + log(1.0-pClass1)  # 计算not abusive的概率
    if p1 > p0:
        return 1
    else:
        return 0
    
    
def testingNB():
    listOPosts, listClasses = loadDataSet()   
    myVocabList = createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = trainNB0(array(trainMat),array(listClasses))
    testEntry = ['love','my','dalmation']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry,"classified as:",classifyNB(thisDoc,p0V,p1V,pAb))
    
    testEntry = ['stupid','garbage']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry,"classified as:",classifyNB(thisDoc,p0V,p1V,pAb))

In [5]:
testingNB()

['love', 'my', 'dalmation'] classified as: 0
['stupid', 'garbage'] classified as: 1


# 4.2 示例：使用朴素贝叶斯过滤垃圾邮件

## 4.2.1 准备数据：切分文本

In [6]:
mySent = 'This book is the best book on Python or M.L. I have ever laid eyes upon.'
mySent1 = mySent.split()  #  问题： 标点符号也被当成此的一部分
print(mySent1)

['This', 'book', 'is', 'the', 'best', 'book', 'on', 'Python', 'or', 'M.L.', 'I', 'have', 'ever', 'laid', 'eyes', 'upon.']


In [7]:
# 解决标点符号也被当成此的一部分的问题  ———— 使用正则表示式，分隔符是除了 |单词， 数字| 外的任意字符串
import re
regEx = re.compile('\\W*')
listOfToken = regEx.split(mySent)
print(listOfToken)
a = [tok for tok in listOfToken if len(tok) > 0]  # 除去 空字符
print(a)
b = [tok.lower() for tok in listOfToken if len(tok) > 0]
print(b)

['This', 'book', 'is', 'the', 'best', 'book', 'on', 'Python', 'or', 'M', 'L', 'I', 'have', 'ever', 'laid', 'eyes', 'upon', '']
['This', 'book', 'is', 'the', 'best', 'book', 'on', 'Python', 'or', 'M', 'L', 'I', 'have', 'ever', 'laid', 'eyes', 'upon']
['this', 'book', 'is', 'the', 'best', 'book', 'on', 'python', 'or', 'm', 'l', 'i', 'have', 'ever', 'laid', 'eyes', 'upon']


  after removing the cwd from sys.path.


## 4.2.2 测试算法：使用朴素贝叶斯进行交叉验证
### 4-5 文件解析以及完整的垃圾邮件测试函数

In [8]:
def textParse(bigString):  # 文本解析   ### 输入是字符串，输出是单词列表
    import re
    listOfTokens = re.split(r'\W*', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

def spamTest():
    docList = []  # 定义docList文档列表
    classList = []  # classList类别列表
    fullText = []   # fullText所有文档词汇
    for i in range(1,26):
        # 定义并读取垃圾邮件文件的词汇分割列表
        wordList = textParse(open('email/spam/%d.txt' % i).read())   # !!! 为了避免文件中非法字符的影响，尽量用下一行
        #wordList = textParse(open('email/spam/%d.txt' % i, "rb").read().decode('GBK','ignore'))
        
        docList.append(wordList)   # 将词汇列表加到文档列表中
        fullText.extend(wordList)   # 将所有词汇列表汇总到fullText中
        classList.append(1)   # 文档类别为1，spam
        
        #wordList = textParse(open('email/ham/%d.txt' % i).read())   # !!! 原书本中 bug 因为有可能文件中存在类似“�”非法字符。
        # 提示：UnicodeDecodeError: 'gbk' codec can't decode byte 0xae in position 199: illegal multibyte sequence
        wordList = textParse(open('email/ham/%d.txt' % i,  "rb").read().decode('GBK','ignore') )  # 读取非垃圾邮件的文档
        
        docList.append(wordList)  
        fullText.extend(wordList)
        classList.append(0)  # 类别为0，非垃圾邮件
        
    vocabList = createVocabList(docList)
    #trainingSet = range(50)  # !!! 原书本中 python2.7   提示：TypeError: 'range' object doesn't support item deletion
    #因为是python3中range不返回数组对象，而是返回range对象 
    trainingSet = list(range(50))   # 定义训练集的索引和测试集
    testSet = []
    for i in range(10):   # 随机的选择10个作为测试集
        randIndex = int(random.uniform(0,len(trainingSet)))  # 随机索引
        testSet.append(trainingSet[randIndex])  # 将随机选择的文档加入到测试集中
        del(trainingSet[randIndex])  # 从训练集中删除随机选择的文档
    trainMat = []   # 定义训练集的矩阵和类别 
    trainClasses = []
    for docIndex in trainingSet:    # 遍历训练集，求得先验概率和条件概率
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))   # 将词汇列表变为向量放到trainMat中
        trainClasses.append(classList[randIndex])   # 训练集的类别标签
    p0V,p1V,pSpam = trainNB0(array(trainMat),array(trainClasses))    # 计算先验概率，条件概率
    errorCount = 0    
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList,docList[docIndex])   # 将测试集词汇向量化
        if classifyNB(array(wordVector), p0V, p1V,pSpam) != classList[docIndex]:   # 对测试数据进行分类
            errorCount += 1   # 分类不正确，错误计数加1
    print("the error rate is:",float(errorCount)/len(testSet))   

In [10]:
spamTest()

the error rate is: 0.3


  after removing the cwd from sys.path.
