### 使用python进行文本分类

In [1]:
# 准备数据 词表到向量的转换函数
def loadDataSet():
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                  ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                  ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                  ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                  ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                  ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0,1,0,1,0,1]
    return postingList,classVec

将数据集都统一合并成一个集成，组成一个词表

In [2]:
def createVocabList(dataSet):
    vocabSet = set([])
    for document in dataSet:
        vocabSet = vocabSet | set(document)
    return list(vocabSet)

在以上构建的一个小的数据集中，可以将文本集构成一个词向量，来判断给定的一个文档在词库中的词向量

In [3]:
def setOfWords2Vec(vocabList,inputSet):
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1 
        else:
            print 'the word %s is not in my vocabulary! '% word
    return returnVec

可以简单的运行看一下结果

In [4]:
listOPosts,listClasses = loadDataSet()
myVocabList = createVocabList(listOPosts)
myVocabList

['cute',
 'love',
 'help',
 'garbage',
 'quit',
 'I',
 'problems',
 'is',
 'park',
 'stop',
 'flea',
 'dalmation',
 'licks',
 'food',
 'not',
 'him',
 'buying',
 'posting',
 'has',
 'worthless',
 'ate',
 'to',
 'maybe',
 'please',
 'dog',
 'how',
 'stupid',
 'so',
 'take',
 'mr',
 'steak',
 'my']

In [5]:
setOfWords2Vec(myVocabList,listOPosts[0])

[0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1]

In [6]:
setOfWords2Vec(myVocabList,listOPosts[1])

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0]

朴素贝叶斯分类器的训练函数  
采用概率论的知识，计算单词在（侮辱性或非侮辱性文档）中的概率

In [7]:
from numpy import *

# 目前文档分类就是 侮辱性或非侮辱性
def trainNB0(trainMatrix,trainLabels):
    # 计算文档数
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainLabels)/float(numTrainDocs)
    p0Num = ones(numWords)
    p1Num = ones(numWords)
    
    p0Denom = 2.0
    p1Denom = 2.0
    
    for i in range(numTrainDocs):
        if trainLabels[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else: 
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
            
    p1Vect = p1Num/p1Denom
    p0Vect = p0Num/p0Denom
    return p0Vect,p1Vect,pAbusive

In [8]:
trainMat = []
for postiingDoc in listOPosts:
    trainMat.append(setOfWords2Vec(myVocabList,postiingDoc))
trainMat

[[0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  1],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  1,
  0,
  1,
  0,
  1,
  0,
  0,
  0],
 [1,
  1,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1],
 [0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  1,
  1],
 [0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0]]

In [9]:
p0v,p1v,pAb = trainNB0(trainMat,listClasses)

In [10]:
pAb

0.5

In [11]:
p0v

array([ 0.07692308,  0.07692308,  0.07692308,  0.03846154,  0.03846154,
        0.07692308,  0.07692308,  0.07692308,  0.03846154,  0.07692308,
        0.07692308,  0.07692308,  0.07692308,  0.03846154,  0.03846154,
        0.11538462,  0.03846154,  0.03846154,  0.07692308,  0.03846154,
        0.07692308,  0.07692308,  0.03846154,  0.07692308,  0.07692308,
        0.07692308,  0.03846154,  0.07692308,  0.03846154,  0.07692308,
        0.07692308,  0.15384615])

In [12]:
p1v

array([ 0.04761905,  0.04761905,  0.04761905,  0.0952381 ,  0.0952381 ,
        0.04761905,  0.04761905,  0.04761905,  0.0952381 ,  0.0952381 ,
        0.04761905,  0.04761905,  0.04761905,  0.0952381 ,  0.0952381 ,
        0.0952381 ,  0.0952381 ,  0.0952381 ,  0.04761905,  0.14285714,
        0.04761905,  0.0952381 ,  0.0952381 ,  0.04761905,  0.14285714,
        0.04761905,  0.19047619,  0.04761905,  0.0952381 ,  0.04761905,
        0.04761905,  0.04761905])

朴素贝叶斯分类函数

In [13]:
# 由上面计算得出的概率可以进行文本的类别对比
def classifyNB(vec2Classify,p0Vec,p1Vec,pClass1):
    p1 = sum(vec2Classify*p1Vec) + log(pClass1)
    p0 = sum(vec2Classify*p0Vec) + log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0

In [14]:
# 构建一个封装上面所有的操作
def testingNB():
    # 给出词以及标签
    listOPosts,listClasses = loadDataSet()
    # 构建词库
    myVocabList = createVocabList(listOPosts)
    
    trainMat = []
    # 将文本转化为词向量
    for postingDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList,postiingDoc))
    p0V,p1V,pAb = trainNB0(array(trainMat),array(listClasses))
    testEntry = ['love','my','dlmation']
    thisDoc = array(setOfWords2Vec(myVocabList,testEntry))
    print testEntry,'classified as :',classifyNB(thisDoc,p0v,p1v,pAb)
    
    testEntry = ['stupid','garbage']
    thisDoc = array(setOfWords2Vec(myVocabList,testEntry))
    print testEntry,'classified as :',classifyNB(thisDoc,p0v,p1v,pAb)

In [15]:
testingNB()

the word dlmation is not in my vocabulary! 
['love', 'my', 'dlmation'] classified as : 0
['stupid', 'garbage'] classified as : 1


以上进行分类的时候，我们将每个词出现与否作为一个特征，这可以被描述为词集模型（set-of-words）  
如果一个词在文档中不止出现一次，那就意味着包含该词是否出现在文档中不能表达的某种信息，这种方法被称为词袋模型


In [16]:
def bagOfWords2Vec(vacabList,inputSet):
    returnVec = [0]*len(vacabList)
    for word in inputSet:
        if word in vacabList:
            returnVec[vacabList.index(word)] += 1
    return returnVec

切分文本  
对于一个文本字符串，可以使用python的string.split()方法将其切分

In [17]:
mySent = 'This book is the best book on python or M.L. I have ever laid eyes upon.'
words = mySent.split()
words

['This',
 'book',
 'is',
 'the',
 'best',
 'book',
 'on',
 'python',
 'or',
 'M.L.',
 'I',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon.']

可以看到，简单的分割还是有其他的标点符号  
可以使用正则表达式来切分句子，其中分隔符是除单词，数字外的任意字符串

In [18]:
import re

regEx = re.compile('\\W*')
listOfTakens = regEx.split(mySent)
listOfTakens

['This',
 'book',
 'is',
 'the',
 'best',
 'book',
 'on',
 'python',
 'or',
 'M',
 'L',
 'I',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon',
 '']

In [19]:
[tok for tok in listOfTakens if len(tok) >0]

['This',
 'book',
 'is',
 'the',
 'best',
 'book',
 'on',
 'python',
 'or',
 'M',
 'L',
 'I',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon']

### 测试算法：使用朴素贝叶斯进行交叉验证

In [20]:
# 文本解析及完整的垃圾邮件测试函数
def textParse(bigString):
    listOfaTokens = re.split(r'\W*',bigString)
    return [tok.lower() for tok in listOfaTokens if len(tok) >2]

In [21]:
def spamTest():
    # 邮件文档集合列表
    docList = []
    # 分类标签
    classList = []
    for i in range(1,26):
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        classList.append(0)
        
    # 构建词汇表
    vocabList = createVocabList(docList)
    
    trainingSet = range(50)
    testSet = []
    
    # 挑选10条记录
    for i in range(10):
        randIndex = int(random.uniform(0,len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
        
    # 开始构建训练词向量矩阵
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList,docList[docIndex]))
        trainClasses.append(classList[docIndex])
        
    # 对训练集进行概率计算
    p0v,p1v,pSpam = trainNB0(array(trainMat),array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList,docList[docIndex])
        if classifyNB(array(wordVector),p0v,p1v,pSpam) != classList[docIndex]:
            errorCount += 1
    print 'the error rate is :',float(errorCount)/len(testSet)

In [22]:
spamTest()

the error rate is : 0.3


### 使用朴素贝叶斯分类器从个人广告中获取区域倾向

In [23]:
# 收集数据
import feedparser
ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')

In [24]:
ny['entries']

[{'dc_source': u'http://newyork.craigslist.org/wch/stp/6255003153.html',
  'dc_type': u'text',
  'id': u'http://newyork.craigslist.org/wch/stp/6255003153.html',
  'language': u'en-us',
  'link': u'http://newyork.craigslist.org/wch/stp/6255003153.html',
  'links': [{'href': u'http://newyork.craigslist.org/wch/stp/6255003153.html',
    'rel': u'alternate',
    'type': u'text/html'}],
  'published': u'2017-08-08T08:23:37-04:00',
  'published_parsed': time.struct_time(tm_year=2017, tm_mon=8, tm_mday=8, tm_hour=12, tm_min=23, tm_sec=37, tm_wday=1, tm_yday=220, tm_isdst=0),
  'rights': u'copyright 2017 craiglist',
  'rights_detail': {'base': u'https://newyork.craigslist.org/search/stp?format=rss',
   'language': None,
   'type': u'text/plain',
   'value': u'copyright 2017 craiglist'},
  'summary': u"hi \nI've started learning sign language online and would love to find someone to converse with. \nany age, sex, skill level, hearing or deaf, it's all good. \nthanks \nm4m \nm4mw \nasl",
  'summ

In [25]:
len(ny['entries'])

25

In [26]:
import operator
# RSS源分类器及高频词去除函数
# 计算高频函数
def calcMostFreq(vocabList,fullText):
    freqDict = {}
    for token in vocabList:
        freqDict[token] = fullText.count(token)
    sortedFreq = sorted(freqDict.iteritems(),key=operator.itemgetter(1),reverse=True)
    return sortedFreq[:30]

In [27]:
def localWords(feed1,feed0):
    docList = []
    classList = []
    fullText = []
    minLen = min(len(feed1['entries']),len(feed0['entries']))
    for i in range(minLen):
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
        
    vocabList = createVocabList(docList)
    top30Words = calcMostFreq(vocabList,fullText)
    for itemWord in top30Words:
        if itemWord in vocabList:
            vocabList.remove(itemWord)
    
    trainingSet = range(2*minLen)
    testSet = []
    
    for i in range(20):
        randIndex = int(random.uniform(0,len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
        
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2Vec(vocabList,docList[docIndex]))
        trainClasses.append(classList[docIndex])
        
    p0v,p1v,pSpam = trainNB0(array(trainMat),array(trainClasses))
    errorCount = 0.0
    for docIndex in testSet:
        wordVector = bagOfWords2Vec(vocabList,docList[docIndex])
        if classifyNB(array(wordVector),p0v,p1v,pSpam) != classList[docIndex]:
            errorCount +=1
    print 'the error rate is ',float(errorCount)/len(testSet)
    return vocabList,p0v,p1v

In [28]:
ny = feedparser.parse('http://newyork.craiglist.org/stp/index.rss')
sf = feedparser.parse('http://sfbay.craiglist.org/stp/index.rss')
print len(ny['entries'])
print len(sf['entries'])


25
25


In [29]:
vocabList,pSF,pNY = localWords(ny,sf)
pSF

the error rate is  0.6


array([ 0.00239234,  0.00717703,  0.00239234,  0.00239234,  0.00478469,
        0.00239234,  0.00478469,  0.00956938,  0.00478469,  0.00239234,
        0.00239234,  0.00239234,  0.00717703,  0.00239234,  0.00239234,
        0.00239234,  0.00239234,  0.00239234,  0.00478469,  0.00478469,
        0.00239234,  0.00239234,  0.00717703,  0.00478469,  0.00478469,
        0.00239234,  0.00478469,  0.00239234,  0.00239234,  0.00239234,
        0.00239234,  0.00478469,  0.00717703,  0.00478469,  0.00239234,
        0.00239234,  0.01196172,  0.00239234,  0.00478469,  0.00239234,
        0.00239234,  0.00239234,  0.00239234,  0.00478469,  0.00478469,
        0.00478469,  0.00239234,  0.00478469,  0.00956938,  0.00478469,
        0.00478469,  0.00717703,  0.00239234,  0.00239234,  0.00478469,
        0.00239234,  0.00239234,  0.00239234,  0.00478469,  0.00239234,
        0.00478469,  0.00239234,  0.00478469,  0.00239234,  0.00956938,
        0.00239234,  0.00239234,  0.00239234,  0.00239234,  0.00

### 分析数据，显示地域相关的用词


In [30]:
def gettTopWords(ny,sf):
    vocabList,p0v,p1v = localWords(ny,sf)
    topNY = []
    topSF = []
    for i in range(len(p0v)):
        if p0v[i] > 0.009:
            topSF.append((vocabList[i],p0v[i]))
        if p1v[i] > 0.009:
            topNY.append((vocabList[i],p1v[i]))
            
    sortedSF = sorted(topSF,key=lambda pair:pair[1],reverse=True)
    print 'SFSFSFSFSFSFSFSFSFSFSF'
    for item in sortedSF:
        print item[0]
        
    sortedNY = sorted(topNY,key=lambda pair:pair[1],reverse=True)
    print 'NYNYNYNYNYNYNYNYNYNYNY'
    for item in sortedNY:
        print item[0]

In [31]:
gettTopWords(ny,sf)

the error rate is  0.7
SFSFSFSFSFSFSFSFSFSFSF
and
for
you
not
looking
have
any
just
with
can
the
lol
night
your
but
friends
NYNYNYNYNYNYNYNYNYNYNY
and
with
you
the
looking
for
your
would
some
out
male
someone
this
can
any
don
only
please
single
age
