In [1]:
import numpy as np

In [2]:
def loadDataSet():  
    postingList=[['my','dog','has','flea','problem','help','please'],  
                 ['maybe','not','take','him','to','dog','park','stupid'],  
                 ['my','dalmation','is','so','cute','I','love','him'],  
                 ['stop','posting','ate','my','steak','how','to','stop','him'],  
                 ['mr','licks','ate','my','steak','how','to','stop','him'],  
                 ['quit','buying','worthless','dog','food','stupid']]  
    classVec=[0,1,0,1,0,1]  
    return postingList,classVec 

In [3]:
def createVocabList(dataSet):
    vocabSet = set([]) 
    for document in dataSet:
        vocabSet = set(document) | vocabSet # 取并集用于构建集合
    return list(vocabSet)

In [4]:
def setOfWordsToVect(vocabList,inputSet):
    returnVec = [0]*len(vocabList) # 单词向量和单词列表长度相同
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1 # 词集模型
        else:
            print("单词：%s 不存在于单词表中"%word)
    return returnVec

def bagOfWordsToVect(vocabList,inputSet):
    returnVec = [0]*len(vocabList) # 单词向量和单词列表长度相同
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1 # 词袋模型
        else:
            print("单词：%s 不存在于单词表中"%word)
    return returnVec

In [5]:
listOposts,listClasses  = loadDataSet()
vocabList = createVocabList(listOposts)
print(vocabList)

['him', 'park', 'help', 'maybe', 'stupid', 'posting', 'stop', 'cute', 'is', 'steak', 'not', 'take', 'mr', 'dalmation', 'has', 'so', 'ate', 'worthless', 'I', 'buying', 'quit', 'to', 'dog', 'please', 'love', 'my', 'how', 'licks', 'food', 'problem', 'flea']


In [6]:
print(setOfWordsToVect(vocabList, listOposts[0]))

[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1]


In [7]:
def trainNB0(trainMatrix,trainCategory): # 注意输入值为np.ndarray类型
    numTrainDocs = len(trainMatrix)
    pAbusive = np.sum(trainCategory)/numTrainDocs # 分类为1的p(c1)
    numWords = len(trainMatrix[0])
#     p0Num = np.zeros(numWords) # 各类别中每个单词出现次数统计结果列表
#     p1Num = np.zeros(numWords)
#     p0Denom = 0.0 # 各类别中单词的总数
#     p1Denom = 0.0
    p0Num = np.ones(numWords) # 各类别中每个单词出现次数统计结果列表 # 理论上解决了一个p(wj | ci)结果为0，导致全部结果为0
    p1Num = np.ones(numWords)
    p0Denom = 2 # 各类别中单词的总数
    p1Denom = 2
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i]) 
    # 取对数：解决多次连乘导致的下溢出，同时解决了一个p(wj | ci)结果为0，导致全部结果为0，
    # 否则在做预测时就需要将词向量为0对应的pVect计算项剔除，避免了结果乘积为0
    p0Vect = np.log(p0Num/p0Denom)
    p1Vect = np.log(p1Num/p1Denom)
    return p0Vect,p1Vect,pAbusive
    
# 如何实现一个多分类的贝叶斯分类器

def trainNB0M(trainMatrix,trainCategory): # 注意输入值为np.ndarray类型
    numTrainDocs = len(trainMatrix)
    classList = set(trainCategory) # 构建类别列表
    classNum = np.array([trainCategory.count(category) for category in classList])
    pClass = classNum/numTrainDocs # 分类为n的p(Cn)
    numWords = len(trainMatrix[0])
    pCateNum = np.ones((classNum,numWords)) # 各类别中每个单词出现次数统计结果列表
    pCateDenom = np.ones(classNum)*classNum # 各类别中单词的总数  # 为防止广播失效，最好将classNum构建成[1,n]维度矩阵而不是n维数组
    for i in range(numTrainDocs):
        for cate in classList:
            if trainCategory[i] == cate:
                pCateNum[cate] += trainMatrix[i]
                pCateDenom += sum(trainMatrix[i])
    pCateVect = np.log(pCateNum/pCateDenom)
    return pCateVect,pClass

In [8]:
def classify(vect2Classify,p0Vect,p1Vect,pClass1):
    p0 = sum(vect2Classify*p0Vect)+np.log(1-pClass1)
    p1 = sum(vect2Classify*p1Vect)+np.log(pClass1)
    if p1 > p0:
        return 1
    else:
        return 0
    

In [9]:
def testingNB():
    listOPosts,listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(setOfWordsToVect(myVocabList, postinDoc))
    p0V,p1V,pAb = trainNB0(np.array(trainMat),np.array(listClasses))
    testEntry = ['love','my','dalmation']
    testDoc  = np.array(setOfWordsToVect(myVocabList,testEntry))
    print(str(testEntry)+" 分类为：%d"%classify(testDoc,p0V,p1V,pAb))
    testEntry = ['stupid','garbage']
    testDoc  = np.array(setOfWordsToVect(myVocabList,testEntry))
    print(str(testEntry)+" 分类为：%d"%classify(testDoc,p0V,p1V,pAb))

In [10]:
testingNB()

['love', 'my', 'dalmation'] 分类为：0
单词：garbage 不存在于单词表中
['stupid', 'garbage'] 分类为：1


# 邮件分类问题

In [11]:
def textParse(bigString):
    import re 
    listOfTokens = re.split(r'\W*',bigString) # 对比python str.split() 有效剔除了其中标点符号，但是依然存在空白字符 
    return [token.lower() for token in listOfTokens if len(token) > 2] # 通过字长去除空白字符，以及高频介词，缩略语等

In [21]:
def spamTest():
    docList = []
    classList = []
    fullText = []
    # 将所有邮件构建成多维List
    for i in range(1,26):
        wordList = textParse(open('./email/spam/%d.txt'%i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open('./email/ham/%d.txt'%i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    # 构建词列表
    vocabList = createVocabList(docList)
    # 随机挑选构建训练集
    # 剩余数据用于测试
    trainingSet = list(range(50))
    testSet = []
    for i in range(10):
        randIndex = int(np.random.uniform(0,len(trainingSet)))
        testSet.append(trainingSet[randIndex]) # 这一步多余？testSet.append(randIndex)
        del(trainingSet[randIndex])
    # 构建训练矩阵
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWordsToVect(vocabList,docList[docIndex]))
        trainClasses.append(classList[docIndex])
    # 训练模型 -> 计算贝叶斯方程中的条件概率
    p0V,p1V,pSpam = trainNB0(np.array(trainMat),np.array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWordsToVect(vocabList,docList[docIndex])
        if classify(wordVector,p0V,p1V,pSpam) != classList[docIndex]:
            errorCount += 1 # 简写得需要先初始化，不然初始值不知道，无法做运算
            print(classify(wordVector,p0V,p1V,pSpam),classList[docIndex])
            print ("classfication error", docList[docIndex])
    n = errorCount/len(testSet)
    print( '错误率是: %f'%n )
    return p0V,p1V,vocabList


In [22]:
p0V,p1V,vocabList = spamTest()

错误率是: 0.000000


  return _compile(pattern, flags).split(string, maxsplit)


In [26]:
from pyecharts import WordCloud
wordcloud1 = WordCloud(width=1300, height=620)
wordcloud1.add("", vocabList, p1V, word_size_range=[20, 100])
wordcloud2 = WordCloud(width=1300, height=620)
wordcloud2.add("", vocabList, p1V, word_size_range=[20, 100])
wordcloud1

In [27]:
wordcloud2

In [14]:
# 统计最高频单词并从单词表中删除

# 停用词表

# 可以通过分析p0V,p1V来确定两个分类中不同单词的出现概率，这也是naive bayes 的一种使用方式