# CH4  Naive Bayes

In [1]:
import numpy as np

词向量

## 文档[句子[词条]]

In [2]:
def loadDataSet():
    postingList = [['my','dog','has','flea','problems','help','please'],
                  ['maybe','not','take','him','to','dog','park','stupid'],
                  ['my','dalmation','is','so','cute','I','love','him'],
                  ['stop','posting','stupid','worthles','garbage','to','stop','him'],
                  ['quit','buying','worthless','dog','food','stupid']]
    classVec = [0,1,0,1,0,1]
    return postingList, classVec

In [3]:
postingList,classVec = loadDataSet()

## 文档->词表[List]

In [4]:
def createVocabList(dataSet):
    return list(set([y for x in dataSet for y in x]))

In [5]:
myVocabList = createVocabList(postingList)

In [6]:
myVocabList

['not',
 'love',
 'buying',
 'worthless',
 'worthles',
 'please',
 'park',
 'problems',
 'my',
 'stop',
 'cute',
 'him',
 'has',
 'food',
 'to',
 'quit',
 'maybe',
 'garbage',
 'take',
 'help',
 'so',
 'posting',
 'stupid',
 'dalmation',
 'dog',
 'flea',
 'I',
 'is']

## 句子->词向量[List]

In [7]:
def setOfWords2Vec(vocabList, inputSet):
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else:
            print("the word: %s is not in my Vocabulary!" % word)
    return returnVec

In [8]:
setOfWords2Vec(myVocabList,postingList[0])

[0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0]

## 分类器(setofword)

p(w/ci),p(ci)

In [9]:
def trainNB0(trainMatrix,trainCategory):
    numTrainDocs = len(trainMatrix) # 句子，样本数
    numWords = len(trainMatrix[0])  # 词向量长度
    pAbusive = sum(trainCategory)/float(numTrainDocs) # 类别为1的样本占比
    p0Num = np.ones(numWords);p1Num = np.ones(numWords)
    p0Denom = 2.0; p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]        # 向量：样本汇总，所有样本加到一个词向量长度的向量
            p1Denom += sum(trainMatrix[i]) # 标量：所有单词计数
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = np.log(p1Num/p1Denom)   # 向量，训练数据中归为1类的样本统计出的，词向量中每个元素出现的概率
    p0Vect = np.log(p0Num/p0Denom)   # 
    return p0Vect, p1Vect, pAbusive

In [10]:
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    p1 = sum(vec2Classify*p1Vec) + np.log(pClass1)
    p0 = sum(vec2Classify*p0Vec) + np.log(1-pClass1)
    return 1 if p1 > p0 else 0 # 这里可以再处理

In [11]:
def testingNB():
    listOfPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOfPosts)
    trainMat = []
    for postinDoc in listOfPosts:
        trainMat.append(setOfWords2Vec(myVocabList,postinDoc))
    p0V,p1V,pAb = trainNB0(np.array(trainMat),np.array(listClasses))
    testEntry = ["love","my","dalmation"]
    thisDoc = np.array(setOfWords2Vec(myVocabList,testEntry))
    print(testEntry,"classified as: ",classifyNB(thisDoc,p0Vec=p0V,p1Vec=p1V,pClass1=pAb))
    testEntry = ["stupid","garbage"]
    thisDoc = np.array(setOfWords2Vec(myVocabList,testEntry))
    print(testEntry,"classified as: ",classifyNB(thisDoc,p0Vec=p0V,p1Vec=p1V,pClass1=pAb))    

In [12]:
testingNB()

['love', 'my', 'dalmation'] classified as:  0
['stupid', 'garbage'] classified as:  1


## 分类器(bagofword)

In [13]:
def bagOfWords2VecMN(vocabList,inputSet):
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
        else:
            print("the word: %s is not in my Vocabulary!" % word)
    return returnVec

## 垃圾邮件

In [14]:
def textParse(bigString):
    import re
    listOfTokens = re.split(r'\W*',bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

In [15]:
import math
def spamTest():
    docList=[];classList=[];fullText=[]
    for i in list(range(1,26)):
        wordList = textParse(open('./Data/email/spam/%d.txt' % i,encoding="latin_1").read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open('./Data/email/ham/%d.txt' % i,encoding="latin_1").read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    trainingSet = list(range(len(wordList)));testSet=[]
    for i in list(range(10)):
        randIndex = int(np.random.uniform(0,len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = [];trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList,docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V,p1V,pSpam = trainNB0(np.array(trainMat),np.array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector),p0V,p1V,pSpam) != classList[docIndex]:
            errorCount += 1
    print("the error rate is : ", errorCount/len(testSet))
        

In [16]:
spamTest()

the error rate is :  0.5


  return _compile(pattern, flags).split(string, maxsplit)
  This is separate from the ipykernel package so we can avoid doing imports until


## 区域取向

In [17]:
def calcMostFreq(vocabList,fullText):
    import operator
    freqDict = {}
    for token in vocabList:
        freqDict[token] = fullText.count(token)
#     print(freqDict)
    sortedFreq = sorted(freqDict.items(),key=lambda x:x[1],reverse=True)
    return sortedFreq[:30]

In [18]:
def localWords(feed1,feed0):
    import feedparser
    docList=[]; classList = []; fullText =[]
    minLen = min(len(feed1['entries']),len(feed0['entries']))
    for i in list(range(minLen)):
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1) #NY is class 1
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)#create vocabulary
    print(len(vocabList))
    top30Words = calcMostFreq(vocabList,fullText)   #remove top 30 words
    for pairW in top30Words:
        if pairW[0] in vocabList: vocabList.remove(pairW[0])
            
    # https://www.ranks.nl/stopwords
    with open("./Data/stopwords.txt") as f:
        stopwords = f.readlines()
    stopwords = [x.strip() for x in stopwords]
    for pairW in stopwords:
        if pairW in vocabList: vocabList.remove(pairW)

    print(len(vocabList))
    trainingSet = list(range(2*minLen)); testSet=[]           #create test set
    for i in list(range(20)):
        randIndex = int(np.random.uniform(0,len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])  
    trainMat=[]; trainClasses = []
    for docIndex in trainingSet:#train the classifier (get probs) trainNB0
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V,p1V,pSpam = trainNB0(np.array(trainMat),np.array(trainClasses))
    errorCount = 0
    for docIndex in testSet:        #classify the remaining items
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector),p0V,p1V,pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ',float(errorCount)/len(testSet))
    return vocabList,p0V,p1V

In [19]:
import feedparser
ny = feedparser.parse("https://newyork.craigslist.org/search/sof?format=rss")
sf = feedparser.parse("https://sfbay.craigslist.org/search/sof?format=rss")

In [20]:
vocabList,pSF,pNY = localWords(ny,sf)

690
631
the word: who is not in my Vocabulary!
the word: are is not in my Vocabulary!
the word: are is not in my Vocabulary!
the word: that is not in my Vocabulary!
the word: and is not in my Vocabulary!
the word: other is not in my Vocabulary!
the word: small is not in my Vocabulary!
the word: help is not in my Vocabulary!
the word: our is not in my Vocabulary!
the word: and is not in my Vocabulary!
the word: the is not in my Vocabulary!
the word: and is not in my Vocabulary!
the word: more is not in my Vocabulary!
the word: about is not in my Vocabulary!
the word: you is not in my Vocabulary!
the word: will is not in my Vocabulary!
the word: work is not in my Vocabulary!
the word: with is not in my Vocabulary!
the word: our is not in my Vocabulary!
the word: team is not in my Vocabulary!
the word: are is not in my Vocabulary!
the word: looking is not in my Vocabulary!
the word: for is not in my Vocabulary!
the word: developer is not in my Vocabulary!
the word: who is not in my Vocabu

  return _compile(pattern, flags).split(string, maxsplit)


In [21]:
# https://www.ranks.nl/stopwords
with open("./Data/stopwords.txt") as f:
    stopwords = f.readlines()
stopwords = [x.strip() for x in stopwords]


In [22]:
def getTopWords(ny,sf):
    import operator
    vocabList,p0V,p1V=localWords(ny,sf)
    topNY=[]; topSF=[]
    for i in list(range(len(p0V))):
        if p0V[i] > -6.0 : topSF.append((vocabList[i],p0V[i]))
        if p1V[i] > -6.0 : topNY.append((vocabList[i],p1V[i]))
    sortedSF = sorted(topSF, key=lambda pair: pair[1], reverse=True)
    print("SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**")
    for item in sortedSF:
        print (item[0])
    sortedNY = sorted(topNY, key=lambda pair: pair[1], reverse=True)
    print("NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**")
    for item in sortedNY:
        print (item[0])