In [1]:
import numpy as np
import feedparser
import operator

## 词转化为向量

In [2]:
# Build the toy training corpus
def loadDataSet():
    """Return a small hand-labelled corpus for the naive Bayes examples.

    Returns:
        tuple: ``(postingList, classVec)``. ``postingList`` is a list of six
        tokenised posts (each a list of str) and ``classVec`` holds the
        matching labels: 1 = abusive text, 0 = normal speech.
    """
    # One (label, sentence) pair per post; tokens are separated by single spaces.
    labelledPosts = (
        (0, 'my dog has flea problems help please'),
        (1, 'maybe not take him to dog park stupid'),
        (0, 'my dalmation is so cute I love him'),
        (1, 'stop posting stupid worthless garbage'),
        (0, 'mr licks ate my steak how to stop him'),
        (1, 'quit buying worthless dog food stupid'),
    )
    postingList = [sentence.split() for _, sentence in labelledPosts]
    classVec = [label for label, _ in labelledPosts]
    return postingList, classVec

# Collect every distinct word that occurs in the corpus
def createVocabList(dataSet):
    """Return the unique words found across all documents in ``dataSet``.

    Args:
        dataSet: iterable of documents, each an iterable of hashable words.

    Returns:
        list: every distinct word exactly once. The list is built from a set,
        so its order is arbitrary.
    """
    uniqueWords = set()
    for doc in dataSet:
        uniqueWords.update(doc)    # in-place union with this document's words
    return list(uniqueWords)

# Set-of-words model: mark which vocabulary words appear in a document
def setOfWords2Vec(vocabList, inputSet):
    """Encode a document as a binary presence vector over the vocabulary.

    Args:
        vocabList: list of vocabulary words; position ``i`` of the result
            corresponds to ``vocabList[i]``. Words must be hashable.
        inputSet: iterable of words making up the document.

    Returns:
        list of int: same length as ``vocabList``; 1 where the word occurs in
        ``inputSet`` at least once, 0 elsewhere.

    Words that are not in the vocabulary are reported on stdout and skipped.
    """
    # Build the word -> position map once so each lookup is O(1) instead of
    # two linear scans of vocabList. setdefault keeps the FIRST position of a
    # repeated word, which is what list.index() would have returned.
    wordIndex = {}
    for position, vocabWord in enumerate(vocabList):
        wordIndex.setdefault(vocabWord, position)

    returnVec = [0] * len(vocabList)    # zero vector as long as the vocabulary
    for word in inputSet:
        position = wordIndex.get(word)
        if position is None:
            print('the word: %s is not in my Vocabulary!' % word)
        else:
            returnVec[position] = 1
    return returnVec

# Bag-of-words model: count how often each vocabulary word appears in a document
def bagOfWords2VecMN(vocabList, inputSet):
    """Encode a document as a vector of word counts over the vocabulary.

    Args:
        vocabList: list of vocabulary words; position ``i`` of the result
            corresponds to ``vocabList[i]``. Words must be hashable.
        inputSet: iterable of words making up the document (repeats allowed).

    Returns:
        list of int: same length as ``vocabList``; entry ``i`` is the number
        of times ``vocabList[i]`` occurs in ``inputSet``.

    Words that are not in the vocabulary are ignored silently.
    """
    # Build the word -> position map once so each lookup is O(1) instead of
    # two linear scans of vocabList. setdefault keeps the FIRST position of a
    # repeated word, which is what list.index() would have returned.
    wordIndex = {}
    for position, vocabWord in enumerate(vocabList):
        wordIndex.setdefault(vocabWord, position)

    returnVec = [0] * len(vocabList)    # zero vector as long as the vocabulary
    for word in inputSet:
        position = wordIndex.get(word)
        if position is not None:
            returnVec[position] += 1
    return returnVec

## 文件解析

In [3]:
def textParse(bigString):
    """Split raw text into lowercase tokens longer than two characters.

    Args:
        bigString: the text to tokenise.

    Returns:
        list of str: the lowercased tokens of ``bigString``, in order,
        keeping only those with more than two characters.
    """
    import re
    # Split on runs of ONE OR MORE non-word characters. The previous pattern
    # r'\W*' can match the empty string; since Python 3.7 re.split() splits on
    # such empty matches, i.e. between every character, so no token was ever
    # longer than two characters and the result was always empty.
    listOfTokens = re.split(r'\W+', bigString)
    # Drop very short fragments and normalise case.
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

## 朴素贝叶斯分类器训练函数
伪代码如下：  
计算每个类别中文档数目  
对每篇训练文档：  
&ensp;&ensp;&ensp;&ensp;对每个类别：  
&ensp;&ensp;&ensp;&ensp;&ensp;&ensp;&ensp;&ensp;如果词条出现在文档中——>增加该词条计数值  
&ensp;&ensp;&ensp;&ensp;&ensp;&ensp;&ensp;&ensp;增加所有词条计数值  
对每个类别：  
&ensp;&ensp;&ensp;&ensp;对每个词条：  
&ensp;&ensp;&ensp;&ensp;&ensp;&ensp;&ensp;&ensp;将该词条的数目除以总词条数目，得到条件概率  
返回每个类别的条件概率

In [4]:
def trainNB0(trainMatrix, trainCategory):
    """Estimate naive Bayes parameters for a two-class problem.

    Args:
        trainMatrix: sequence of word vectors, one row per training document,
            all of the same length.
        trainCategory: sequence of labels aligned with ``trainMatrix``; a
            label equal to 1 selects class 1, anything else class 0.

    Returns:
        tuple: ``(p0Vect, p1Vect, pAbusive)`` -- the log of the per-word
        conditional probabilities for class 0 and class 1, and the fraction
        ``sum(trainCategory) / len(trainMatrix)`` used as the class-1 prior.
    """
    docCount = len(trainMatrix)
    vocabSize = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(docCount)    # prior of class 1 (abusive)

    # Accumulators indexed by class label. Word counts start at 1 and the
    # denominators at 2.0 so that no word ends up with zero probability.
    wordCounts = {0: np.ones(vocabSize), 1: np.ones(vocabSize)}
    wordTotals = {0: 2.0, 1: 2.0}

    for docIdx in range(docCount):
        label = 1 if trainCategory[docIdx] == 1 else 0
        wordCounts[label] += trainMatrix[docIdx]       # per-word counts for this class
        wordTotals[label] += sum(trainMatrix[docIdx])  # total words seen in this class

    # Work in log space so that later products of many small probabilities
    # become sums and do not underflow.
    p1Vect = np.log(wordCounts[1] / wordTotals[1])
    p0Vect = np.log(wordCounts[0] / wordTotals[0])
    return p0Vect, p1Vect, pAbusive

## 朴素贝叶斯分类函数

In [5]:
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Pick the more likely of two classes for a word vector.

    Args:
        vec2Classify: word vector of the document to classify.
        p0Vec: per-word log-probabilities for class 0.
        p1Vec: per-word log-probabilities for class 1.
        pClass1: prior probability of class 1.

    Returns:
        int: 1 if the class-1 log score is strictly greater than the class-0
        log score, otherwise 0.
    """
    # In log space the product of independent word likelihoods and the class
    # prior turns into a sum.
    logScore1 = sum(vec2Classify * p1Vec) + np.log(pClass1)
    logScore0 = sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    return 1 if logScore1 > logScore0 else 0

### 找到出现次数最多的30个词

In [6]:
def calcMostFreq(vocabList, fullText, topN=30):
    """Return the most frequent vocabulary words in ``fullText``.

    Args:
        vocabList: iterable of candidate words (hashable).
        fullText: iterable of all tokens in the corpus (hashable).
        topN: maximum number of ``(word, count)`` pairs to return; defaults
            to 30. ``None`` returns every vocabulary word.

    Returns:
        list of tuple: ``(word, count)`` pairs sorted by count, highest first.
        Words with equal counts keep their ``vocabList`` order. Vocabulary
        words that never occur are included with a count of 0.
    """
    from collections import Counter

    # Count every token in one pass instead of calling fullText.count() once
    # per vocabulary word (which rescans the whole corpus each time).
    occurrences = Counter(fullText)
    freqDict = {token: occurrences[token] for token in vocabList}
    # Sort words by number of occurrences, highest first.
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:topN]

## RSS分类器

In [7]:
def localWords(feed1, feed0, testSize=5):
    """Train and evaluate a naive Bayes classifier on two parsed feeds.

    Args:
        feed1: mapping with an ``'entries'`` sequence whose items carry a
            ``'summary'`` string; its documents are labelled class 1.
        feed0: same structure as ``feed1``; its documents are labelled class 0.
        testSize: number of randomly chosen documents to hold out for
            evaluation (default 5). It is capped so that at least one
            document is always left for training.

    Returns:
        tuple: ``(vocabList, p0V, p1V)`` -- the vocabulary after removing the
        most frequent words, and the class-0 / class-1 vectors returned by
        ``trainNB0``.

    Side effects:
        Prints the error rate on the held-out documents and draws from
        numpy's global random state.

    Note:
        If either feed has no entries there is nothing to train on and the
        call fails inside ``trainNB0``, which indexes the first training row.
    """
    docList = []
    classList = []
    fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    # Take the same number of entries from each feed; for every index the
    # class-1 document is stored first, then the class-0 document.
    for i in range(minLen):
        for feed, label in ((feed1, 1), (feed0, 0)):
            wordList = textParse(feed['entries'][i]['summary'])
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(label)
    vocabList = createVocabList(docList)
    # Drop the most frequent words reported by calcMostFreq, keeping the
    # remaining vocabulary in its original order.
    frequentWords = {pairW[0] for pairW in calcMostFreq(vocabList, fullText)}
    vocabList = [word for word in vocabList if word not in frequentWords]
    trainingSet = list(range(2*minLen))
    testSet = []
    # Move randomly chosen documents from the training set to the test set.
    # Capping the count keeps one training document and avoids indexing an
    # exhausted list when the feeds are short.
    numTest = max(0, min(testSize, len(trainingSet) - 1))
    for _ in range(numTest):
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    # Build the word-count vectors and labels for the TRAINING documents.
    trainMat = [bagOfWords2VecMN(vocabList, docList[docIndex]) for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    # Estimate the per-class word probabilities.
    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses))
    errorCount = 0
    # Classify every held-out document and count the mistakes.
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    # With no held-out documents there is nothing to misclassify; report 0.0
    # instead of dividing by zero.
    errorRate = float(errorCount)/len(testSet) if testSet else 0.0
    print('the error rate is: ', errorRate)
    return vocabList, p0V, p1V

## 返回高于某个阈值的所有词

In [8]:
def getTopWords(ny, sf, threshold=-6.0):
    """Print the words whose log-probability exceeds a threshold, per feed.

    Args:
        ny: parsed feed passed to ``localWords`` as class 1.
        sf: parsed feed passed to ``localWords`` as class 0.
        threshold: a word is listed for a feed when its log-probability for
            that feed's class is strictly greater than this value
            (default -6.0).

    Returns:
        None. The words are printed, highest log-probability first, under an
        ``SF`` banner (class 0) and then an ``NY`` banner (class 1).
    """
    vocabList, p0V, p1V = localWords(ny, sf)
    topNY = []
    topSF = []
    for word, sfScore, nyScore in zip(vocabList, p0V, p1V):
        if sfScore > threshold:
            topSF.append((word, sfScore))
        if nyScore > threshold:
            # Pair the word with its class-1 score so the NY list is ranked by
            # the NY probabilities (it was previously paired with the class-0
            # score, which ordered the NY words by the SF values).
            topNY.append((word, nyScore))
    sortedSF = sorted(topSF, key=lambda pair: pair[1], reverse=True)
    print('SF*SF*SF*SF*SF*SF*SF*SF*SF*SF*SF*SF*SF*SF*SF*SF*SF*SF*SF')
    for item in sortedSF:
        print(item[0])
    sortedNY = sorted(topNY, key=lambda pair: pair[1], reverse=True)
    print('NY*NY*NY*NY*NY*NY*NY*NY*NY*NY*NY*NY*NY*NY*NY*NY*NY*NY*NY')
    for item in sortedNY:
        print(item[0])

In [9]:
# Download and parse the two Craigslist RSS feeds (needs network access).
# NOTE(review): the two URLs use different search paths ('res' vs 'apa'), so
# the feeds may differ by listing category as well as by city -- confirm this
# is intended.
ny = feedparser.parse('https://newyork.craigslist.org/search/res?format=rss')
sf = feedparser.parse('https://sfbay.craigslist.org/search/apa?format=rss')

In [11]:
# Train and evaluate on the two feeds, then print the high-probability words for each.
getTopWords(ny, sf)

the error rate is:  0.2
SF*SF*SF*SF*SF*SF*SF*SF*SF*SF*SF*SF*SF*SF*SF*SF*SF*SF*SF
features
unit
easy
located
two
vineyard
santa
level
location
beautiful
luxury
family
house
floor
building
block
city
san
deck
that
park
remodeled
modern
ave
bay
clara
access
casa
hill
walking
walk
light
offers
com
steps
http
cupertino
viva
make
full
downtown
gorgeous
contact
call
school
schedule
tour
set
living
heights
today
bustling
visit
completely
between
perfectly
dining
sands
sahara
well
market
napa
just
country
broadway
when
bed
single
backyard
distance
now
ample
counter
bart
bright
starbird
wood
lot
hardwood
renovated
built
architecture
wine
floors
hyde
jose
station
close
overlooking
baxter
updated
fully
has
francisco
big
beach
pacific
space
bedrooms
2313
jackson
available
furnished
neighborhood
open
halfway
bathroom
quiet
cpmc
studio
university
2pm
noe
deserve
feel
webster
stunning
capitola
berkeley
entertaining
stay
luxurious
loaded
tastefully
fireplace
beat
cottage
sedgwick
next
heart
energetic
c

  return _compile(pattern, flags).split(string, maxsplit)
