# 朴素贝叶斯

<font size=4>
$p(c_i|x,y)=\frac{p(x,y|c_i)p(c_i)}{p(x,y)}$
</font>

## 从文本中构建词向量

In [1]:
def loadDataSet():
    """Return the toy message-board corpus used by the first half of the notebook.

    Returns:
        tuple: ``(postingList, classVec)``. ``postingList`` is a list of six
        tokenized posts (each a list of str); ``classVec`` is the parallel
        list of integer class labels (0 or 1) consumed by ``trainNB0``.
    """
    # Each entry pairs the raw post text with its class label; splitting on
    # whitespace reproduces the token lists exactly.
    labelledPosts = [
        ('my dog has flea problems help please', 0),
        ('maybe not take him to dog park stupid', 1),
        ('my dalmation is so cute I love him', 0),
        ('stop posting stupid worthless garbage', 1),
        ('mr licks ate my steak how to stop him', 0),
        ('quit buying worthless dog food stupid', 1),
    ]
    postingList = [text.split() for text, _ in labelledPosts]
    classVec = [label for _, label in labelledPosts]
    return postingList, classVec

In [2]:
def createVocabList(dataSet):
    """Collect every distinct token that appears in any document.

    Args:
        dataSet: iterable of documents, each an iterable of hashable tokens.

    Returns:
        list: the unique tokens. The order comes from set iteration and is
        therefore unspecified.
    """
    uniqueWords = set()
    for tokens in dataSet:
        uniqueWords.update(tokens)
    return list(uniqueWords)

In [3]:
def setOfWords2Vec(vocabList, inputSet):
    """Encode a document as a 0/1 presence vector over the vocabulary.

    Args:
        vocabList: list of vocabulary words; position defines the vector index.
        inputSet: iterable of tokens making up one document.

    Returns:
        list[int]: same length as ``vocabList``; entry is 1 when the word
        occurs in ``inputSet`` at least once, else 0. Tokens missing from the
        vocabulary are reported on stdout and otherwise ignored.
    """
    presence = [0 for _ in vocabList]
    for token in inputSet:
        if token not in vocabList:
            print("the word: %s is not in my Vocabulary!" % token)
            continue
        presence[vocabList.index(token)] = 1
    return presence

In [4]:
# Load the toy corpus: tokenized posts and their parallel 0/1 class labels.
listOPosts, listClasses = loadDataSet()

In [5]:
# Build the vocabulary (unique words across all posts) and display it.
myVocabList = createVocabList(listOPosts)
myVocabList

['cute',
 'love',
 'has',
 'flea',
 'problems',
 'stupid',
 'is',
 'my',
 'licks',
 'ate',
 'buying',
 'mr',
 'to',
 'dalmation',
 'so',
 'maybe',
 'park',
 'please',
 'dog',
 'posting',
 'him',
 'food',
 'steak',
 'help',
 'not',
 'take',
 'stop',
 'I',
 'quit',
 'how',
 'garbage',
 'worthless']

In [6]:
# Encode the first post as a 0/1 presence vector over myVocabList.
setOfWords2Vec(myVocabList, listOPosts[0])

[0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [7]:
# Encode the fourth post (index 3) as a 0/1 presence vector over myVocabList.
setOfWords2Vec(myVocabList, listOPosts[3])

[0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1]

## 从词向量计算概率

<font size=4>
$p(c_i|w)=\frac{p(w|c_i)p(c_i)}{p(w)}$
</font>

### 朴素贝叶斯分类器训练函数

In [8]:
import numpy as np

In [9]:
def trainNB0(trainMatrix, trainCategory):
    """Estimate naive Bayes parameters for a two-class problem.

    Args:
        trainMatrix: sequence of equal-length numeric word vectors, one per
            training document.
        trainCategory: sequence of class labels parallel to ``trainMatrix``.
            A label equal to 1 is treated as class 1; anything else counts
            as class 0.

    Returns:
        tuple: ``(p0Vect, p1Vect, pAbusive)``. The first two are NumPy arrays
        holding the natural log of each word's conditional probability for
        class 0 and class 1; ``pAbusive`` is ``sum(trainCategory)`` divided by
        the number of documents (the class-1 prior for 0/1 labels).
    """
    docCount = len(trainMatrix)
    vocabSize = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(docCount)

    # Index 0 holds the class-0 accumulators, index 1 the class-1 ones.
    # Counts start at 1 and totals at 2.0 so that no word ends up with a
    # zero probability (which would make the log undefined).
    wordCounts = [np.ones(vocabSize), np.ones(vocabSize)]
    wordTotals = [2.0, 2.0]

    for docIdx in range(docCount):
        cls = 1 if trainCategory[docIdx] == 1 else 0
        row = trainMatrix[docIdx]
        wordCounts[cls] += row
        wordTotals[cls] += np.sum(row)

    # Work in log space so that later products of many small probabilities
    # become sums and do not underflow.
    p1Vect = np.log(wordCounts[1] / wordTotals[1])
    p0Vect = np.log(wordCounts[0] / wordTotals[0])
    return p0Vect, p1Vect, pAbusive

In [10]:
# Encode every post as a presence vector; rows line up with listClasses.
trainMat = []
for postinDoc in listOPosts:
    trainMat.append(setOfWords2Vec(myVocabList, postinDoc))

In [11]:
# Train: per-class log word probabilities (p0V, p1V) and the class-1 prior (pAb).
p0V, p1V, pAb = trainNB0(trainMat, listClasses)

In [12]:
# Class-1 prior: sum of the labels divided by the number of training posts.
pAb

0.5

In [13]:
# Log conditional word probabilities for class 0, indexed like myVocabList.
p0V

array([-2.56494936, -2.56494936, -2.56494936, -2.56494936, -2.56494936,
       -3.25809654, -2.56494936, -1.87180218, -2.56494936, -2.56494936,
       -3.25809654, -2.56494936, -2.56494936, -2.56494936, -2.56494936,
       -3.25809654, -3.25809654, -2.56494936, -2.56494936, -3.25809654,
       -2.15948425, -3.25809654, -2.56494936, -2.56494936, -3.25809654,
       -3.25809654, -2.56494936, -2.56494936, -3.25809654, -2.56494936,
       -3.25809654, -3.25809654])

In [14]:
# Log conditional word probabilities for class 1, indexed like myVocabList.
p1V

array([-3.04452244, -3.04452244, -3.04452244, -3.04452244, -3.04452244,
       -1.65822808, -3.04452244, -3.04452244, -3.04452244, -3.04452244,
       -2.35137526, -3.04452244, -2.35137526, -3.04452244, -3.04452244,
       -2.35137526, -2.35137526, -3.04452244, -1.94591015, -2.35137526,
       -2.35137526, -2.35137526, -3.04452244, -3.04452244, -2.35137526,
       -2.35137526, -2.35137526, -3.04452244, -2.35137526, -3.04452244,
       -2.35137526, -1.94591015])

### 朴素贝叶斯分类函数

In [15]:
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Pick the more likely of two classes for one word vector.

    Args:
        vec2Classify: NumPy word vector of the document to classify.
        p0Vec: log conditional word probabilities for class 0.
        p1Vec: log conditional word probabilities for class 1.
        pClass1: prior probability of class 1.

    Returns:
        int: 1 when the class-1 log score is strictly greater than the
        class-0 log score, otherwise 0 (ties go to class 0).
    """
    # Log-space score: sum of the selected log likelihoods plus the log prior.
    scoreClass1 = np.sum(vec2Classify * p1Vec) + np.log(pClass1)
    scoreClass0 = np.sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    return 1 if scoreClass1 > scoreClass0 else 0

In [16]:
def testingNB():
    """Train on the toy corpus and print the predicted class of two probe posts.

    Rebuilds the vocabulary and training matrix from ``loadDataSet``, fits the
    model with ``trainNB0`` and prints one line per probe entry in the form
    ``<tokens>classified as: <label>``. Returns nothing.
    """
    posts, labels = loadDataSet()
    vocab = createVocabList(posts)
    trainMat = [setOfWords2Vec(vocab, post) for post in posts]
    p0V, p1V, pAb = trainNB0(np.array(trainMat), np.array(labels))

    probeEntries = (['love', 'my', 'dalmation'], ['stupid', 'garbage'])
    for testEntry in probeEntries:
        thisDoc = np.array(setOfWords2Vec(vocab, testEntry))
        predicted = classifyNB(thisDoc, p0V, p1V, pAb)
        print(str(testEntry) + 'classified as: ' + str(predicted))

In [17]:
# Run the end-to-end check on the toy corpus; it prints one line per probe post.
testingNB()

['love', 'my', 'dalmation']classified as: 0
['stupid', 'garbage']classified as: 1


### 文档词袋模型

In [18]:
def bagOfWords2VecMN(vocabList, inputSet):
    """Encode a document as a word-count (bag-of-words) vector.

    Args:
        vocabList: list of vocabulary words; position defines the vector index.
        inputSet: iterable of tokens making up one document.

    Returns:
        list[int]: same length as ``vocabList``; each entry is the number of
        times that word occurs in ``inputSet``. Tokens that are not in the
        vocabulary are silently skipped.
    """
    counts = [0 for _ in vocabList]
    knownTokens = (token for token in inputSet if token in vocabList)
    for token in knownTokens:
        counts[vocabList.index(token)] += 1
    return counts

## 使用朴素贝叶斯过滤垃圾邮件

In [19]:
# Naive whitespace split: punctuation stays attached to tokens (e.g. 'M.L.').
mySent = 'This book is the best book on Python or M.L. I have ever laid eyes upon'
mySent.split()

['This',
 'book',
 'is',
 'the',
 'best',
 'book',
 'on',
 'Python',
 'or',
 'M.L.',
 'I',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon']

In [20]:
import re
# Split on runs of non-word characters. The pattern must be a raw string
# ('\W' is not a valid string escape) and must use '+' rather than '*':
# a pattern that can match the empty string makes re.split break the text
# between every character on Python 3.7 and later.
regEx = re.compile(r'\W+')
listOfTokens = regEx.split(mySent)
listOfTokens

  This is separate from the ipykernel package so we can avoid doing imports until


['This',
 'book',
 'is',
 'the',
 'best',
 'book',
 'on',
 'Python',
 'or',
 'M',
 'L',
 'I',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon']

In [21]:
# Normalize case so that e.g. 'This' and 'this' become the same token.
[tok.lower() for tok in listOfTokens]

['this',
 'book',
 'is',
 'the',
 'best',
 'book',
 'on',
 'python',
 'or',
 'm',
 'l',
 'i',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon']

### 使用朴素贝叶斯进行交叉验证

In [22]:
def textParse(bigString):
    """Tokenize raw text into lowercase words longer than two characters.

    Args:
        bigString (str): the text to tokenize.

    Returns:
        list[str]: lowercase tokens, in order of appearance, obtained by
        splitting on runs of non-word characters and dropping tokens of
        length 2 or less (which also removes empty strings produced at the
        edges of the split).
    """
    import re
    # Use '\W+' rather than '\W*': a pattern that can match the empty string
    # makes re.split cut between every character on Python 3.7+, which would
    # leave only one-character tokens and therefore an empty result here.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

In [23]:
def spamTest():
    """Run one random hold-out evaluation of the naive Bayes spam filter.

    Reads ``email/spam/1.txt`` .. ``email/spam/25.txt`` (class 1) and
    ``email/ham/1.txt`` .. ``email/ham/25.txt`` (class 0), holds out 10
    randomly chosen documents for testing, trains on the remaining 40 and
    prints every misclassified document followed by the error rate.
    Returns nothing. The split uses ``np.random`` without setting a seed,
    so the result varies from call to call.
    """
    docList = []
    classList = []
    for i in range(1, 26):
        # Spam first, then ham, for each index so documents stay interleaved.
        for pathTemplate, label in (('email/spam/%d.txt', 1), ('email/ham/%d.txt', 0)):
            # Read-only binary mode inside a context manager: the handle is
            # closed promptly and read-only files can still be opened.
            with open(pathTemplate % i, 'rb') as emailFile:
                rawBytes = emailFile.read()
            # Undecodable bytes are dropped rather than aborting the run.
            docList.append(textParse(rawBytes.decode('utf-8', 'ignore')))
            classList.append(label)
    vocabList = createVocabList(docList)

    # Move 10 random document indices from the training pool to the test set.
    trainingSet = list(range(50))
    testSet = []
    for _ in range(10):
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]

    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses))

    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            print("classification error" + str(docList[docIndex]))
    print('the error rate is: ' + str(float(errorCount) / len(testSet)))

In [24]:
# Run one random hold-out evaluation; the split is random, so the printed
# error rate can differ between runs.
spamTest()

classification error['experience', 'with', 'biggerpenis', 'today', 'grow', 'inches', 'more', 'the', 'safest', 'most', 'effective', 'methods', 'of_penisen1argement', 'save', 'your', 'time', 'and', 'money', 'bettererections', 'with', 'effective', 'ma1eenhancement', 'products', 'ma1eenhancement', 'supplement', 'trusted', 'millions', 'buy', 'today']
the error rate is: 0.1


  return _compile(pattern, flags).split(string, maxsplit)


## 使用朴素贝叶斯分类器分析 RSS 源的用词倾向

In [25]:
import feedparser
# Fetch and parse the NASA "image of the day" RSS feed, then count its entries.
nasa = feedparser.parse('feed://www.nasa.gov/rss/dyn/image_of_the_day.rss')
len(nasa['entries'])

60

In [26]:
# Fetch and parse the Independent UK news RSS feed, then count its entries.
news = feedparser.parse('http://www.independent.co.uk/news/uk/rss')
len(news['entries'])

27

In [27]:
def calcMostFreq(vocabList, fullText, topN=30):
    """Return the most frequent vocabulary words in a token stream.

    Args:
        vocabList: iterable of (hashable) vocabulary words to rank.
        fullText: iterable of (hashable) tokens in which occurrences are
            counted.
        topN (int): maximum number of entries to return. Defaults to 30.

    Returns:
        list[tuple]: up to ``topN`` ``(word, count)`` pairs sorted by
        descending count. Words with equal counts keep their relative order
        from ``vocabList`` because the sort is stable.
    """
    import operator
    from collections import Counter
    # Count all tokens in a single pass instead of calling fullText.count()
    # once per vocabulary word.
    occurrences = Counter(fullText)
    # Counter returns 0 for tokens it has never seen, matching list.count().
    freqDict = {token: occurrences[token] for token in vocabList}
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:topN]

In [28]:
def localWords(feed1, feed0, numTest=20):
    """Train and evaluate a naive Bayes classifier that tells two feeds apart.

    Args:
        feed1: parsed feed whose ``['entries'][i]['summary']`` texts form
            class 1.
        feed0: parsed feed with the same structure; its texts form class 0.
        numTest (int): number of documents held out at random for testing.
            Defaults to 20.

    Returns:
        tuple: ``(vocabList, p0V, p1V)``: the vocabulary with the most
        frequent words removed, and the log word probabilities for class 0
        (``feed0``) and class 1 (``feed1``) in that order.

    Raises:
        ValueError: if ``numTest`` is not between 1 and the number of
            documents minus one, i.e. the hold-out split would leave the test
            set or the training set empty.

    Prints the hold-out error rate. The split uses ``np.random`` without
    setting a seed, so results vary between calls.
    """
    docList = []
    classList = []
    fullText = []
    # Use the same number of entries from both feeds so classes are balanced.
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        # Class-1 document first, then class-0, so documents stay interleaved.
        for feed, label in ((feed1, 1), (feed0, 0)):
            wordList = textParse(feed['entries'][i]['summary'])
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(label)

    if not 0 < numTest < len(docList):
        raise ValueError(
            'numTest must be between 1 and %d (number of documents - 1), got %r'
            % (len(docList) - 1, numTest))

    vocabList = createVocabList(docList)
    # Drop the most frequent words returned by calcMostFreq from the vocabulary.
    frequentWords = set(word for word, _ in calcMostFreq(vocabList, fullText))
    vocabList = [word for word in vocabList if word not in frequentWords]

    # Move numTest random document indices from the training pool to the test set.
    trainingSet = list(range(2 * minLen))
    testSet = []
    for _ in range(numTest):
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]

    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pClass1 = trainNB0(np.array(trainMat), np.array(trainClasses))

    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector), p0V, p1V, pClass1) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ' + str(float(errorCount) / len(testSet)))
    return vocabList, p0V, p1V

In [29]:
# localWords returns (vocabList, p0V, p1V): p0V belongs to feed0 (news, the
# second argument) and p1V to feed1 (nasa, the first argument), so the news
# vector must be unpacked first.
vocabList, p_news, p_nasa = localWords(nasa, news)

the error rate is: 0.5


In [30]:
# Repeat the random hold-out run. localWords returns the class-0 vector
# (feed0 = news) before the class-1 vector (feed1 = nasa).
vocabList, p_news, p_nasa = localWords(nasa, news)

the error rate is: 0.55


In [31]:
# Repeat the random hold-out run. localWords returns the class-0 vector
# (feed0 = news) before the class-1 vector (feed1 = nasa).
vocabList, p_news, p_nasa = localWords(nasa, news)

the error rate is: 0.45


In [32]:
def getTopWords(p0V,p1V):
    """Train on two feeds and print the words most indicative of each.

    Despite their names, the parameters are the two parsed feeds, not
    probability vectors: ``p0V`` is passed to ``localWords`` as ``feed1``
    (class 1) and ``p1V`` as ``feed0`` (class 0).

    Output, after the error-rate line printed by ``localWords``: a ``p1V**``
    banner followed by the words whose class-0 log probability exceeds -6.0
    (class 0 is the feed passed as parameter ``p1V``), then a ``p0V**``
    banner followed by the words whose class-1 log probability exceeds -6.0.
    Each list is ordered by descending log probability. Returns nothing.
    """
    vocabList, class0LogProbs, class1LogProbs = localWords(p0V, p1V)
    wordCount = len(class0LogProbs)

    # Class 0 is the second feed (parameter p1V), hence the "p1V" banner.
    underP1Banner = [(vocabList[i], class0LogProbs[i])
                     for i in range(wordCount) if class0LogProbs[i] > -6.0]
    # Class 1 is the first feed (parameter p0V), hence the "p0V" banner.
    underP0Banner = [(vocabList[i], class1LogProbs[i])
                     for i in range(wordCount) if class1LogProbs[i] > -6.0]

    print("p1V**p1V**p1V**p1V**p1V**p1V**p1V**p1V**p1V**p1V**p1V**p1V**p1V**p1V**p1V**p1V**")
    for word, _ in sorted(underP1Banner, key=lambda pair: pair[1], reverse=True):
        print(word)

    print("p0V**p0V**p0V**p0V**p0V**p0V**p0V**p0V**p0V**p0V**p0V**p0V**p0V**p0V**p0V**p0V**")
    for word, _ in sorted(underP0Banner, key=lambda pair: pair[1], reverse=True):
        print(word)

In [33]:
# Retrain on the two feeds and print the most indicative words for each.
getTopWords(nasa, news)

the error rate is: 0.4
p1V**p1V**p1V**p1V**p1V**p1V**p1V**p1V**p1V**p1V**p1V**p1V**p1V**p1V**p1V**p1V**
office
second
need
police
only
not
now
life
some
becoming
modern
amber
slip
firefighters
shadow
condemnation
danger
tragic
west
decades
yet
rules
europe
street
leader
unacceptable
two
have
stagnating
into
accident
centres
investigation
growing
information
letter
announces
insists
rates
family
expectancy
trace
wake
video
ever
countries
arlene
momentum
labour
wilderness
parts
blame
memories
2015
productive
issue
employees
previously
cartwright
latest
many
before
downing
animated
forecast
took
greenpeace
joe
hours
said
weather
closure
british
public
stronger
north
wales
fertility
ninety
fierce
them
judge
living
home
setting
campsfield
older
divided
remembrance
repurposed
spent
thought
history
metropolitan
don
look
johnson
twenty
panic
all
entered
trend
raf
mean
longest
bells
foster
work
referendum
new
because
war
interpol
increases
keep
falling
navigator
south
met
following
via
commente