In [1]:
def loadDataSet():
    postingList = [['my', 'dog', 'has', 'flea', 'problem', 'help', 'please'], \
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'], \
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'], \
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'], \
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'], \
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    # 1 is abusive, 0 not
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec


def createVocabList(dataSet):
    # create empty set
    vocabSet = set([])
    for document in dataSet:
        # | is union
        vocabSet = vocabSet | set(document)
    return list(vocabSet)


def setOfWords2Vec(vocabList, inputSet):
    # create vector of all 0s
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            # if word in vocabList, set to 1
            returnVec[vocabList.index(word)] = 1
        else:
            print("the word: %s is not in my Vocabulary!" % word)
    return returnVec

In [2]:
import numpy as np

In [3]:
def trainNBO(trainMatrix, trainCategory):
    # number of training docs
    numTrainDocs = len(trainMatrix)
    # number of words in each doc
    numWords = len(trainMatrix[0])
    # sum of all abusive docs
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    # create vector of all 0s
    p0Num = np.ones(numWords)
    p1Num = np.ones(numWords)
    # denominator
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDocs):
        # if abusive
        if trainCategory[i] == 1:
            # add vector to p1Num
            p1Num += trainMatrix[i]
            # add number of words in doc to p1Denom
            p1Denom += sum(trainMatrix[i])
        else:
            # add vector to p0Num
            p0Num += trainMatrix[i]
            # add number of words in doc to p0Denom
            p0Denom += sum(trainMatrix[i])
    # element-wise division
    p1Vect = np.log(p1Num / p1Denom)
    p0Vect = np.log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive

In [4]:
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    # element-wise multiplication
    p1 = sum(vec2Classify * p1Vec) + np.log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + np.log(1 - pClass1)
    # if p1 > p0
    if p1 > p0:
        return 1
    else:
        return 0


def testingNB():
    listOPosts, listClasses = loadDataSet()
    # create vocabulary list
    myVocabList = createVocabList(listOPosts)
    # create empty list
    trainMat = []
    # for each post in list of posts
    for postinDoc in listOPosts:
        # convert post to vector
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    # train classifier
    p0V, p1V, pAb = trainNBO(np.array(trainMat), np.array(listClasses))
    # test classifier
    testEntry = ['love', 'my', 'dalmation']
    # convert test entry to vector
    thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
    # classify test entry
    print(testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))
    testEntry = ['stupid', 'garbage']
    # convert test entry to vector
    thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
    # classify test entry
    print(testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))


In [5]:
testingNB()

['love', 'my', 'dalmation'] classified as:  0
['stupid', 'garbage'] classified as:  1


In [6]:
def bagOfWords2VecMN(vocabList, inputSet):
    # create vector of all 0s
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            # if word in vocabList, increment
            returnVec[vocabList.index(word)] += 1
    return returnVec

In [7]:
mySent = 'This book is the best book on Python or M.L. I have ever laid eyes upon.'
mySent.split()

['This',
 'book',
 'is',
 'the',
 'best',
 'book',
 'on',
 'Python',
 'or',
 'M.L.',
 'I',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon.']

In [8]:
[tok for tok in mySent.split() if len(tok) > 0]

['This',
 'book',
 'is',
 'the',
 'best',
 'book',
 'on',
 'Python',
 'or',
 'M.L.',
 'I',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon.']

In [9]:
[token.lower() for token in mySent.split() if len(token) > 0]

['this',
 'book',
 'is',
 'the',
 'best',
 'book',
 'on',
 'python',
 'or',
 'm.l.',
 'i',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon.']

In [10]:
emailText = open('machinelearninginaction/Ch04/email/ham/6.txt', encoding='gbk').read()
listOfTokens = [token.lower() for token in emailText.split() if len(token) > 0]
listOfTokens

['hello,',
 'since',
 'you',
 'are',
 'an',
 'owner',
 'of',
 'at',
 'least',
 'one',
 'google',
 'groups',
 'group',
 'that',
 'uses',
 'the',
 'customized',
 'welcome',
 'message,',
 'pages',
 'or',
 'files,',
 'we',
 'are',
 'writing',
 'to',
 'inform',
 'you',
 'that',
 'we',
 'will',
 'no',
 'longer',
 'be',
 'supporting',
 'these',
 'features',
 'starting',
 'february',
 '2011.',
 'we',
 'made',
 'this',
 'decision',
 'so',
 'that',
 'we',
 'can',
 'focus',
 'on',
 'improving',
 'the',
 'core',
 'functionalities',
 'of',
 'google',
 'groups',
 '--',
 'mailing',
 'lists',
 'and',
 'forum',
 'discussions.',
 'instead',
 'of',
 'these',
 'features,',
 'we',
 'encourage',
 'you',
 'to',
 'use',
 'products',
 'that',
 'are',
 'designed',
 'specifically',
 'for',
 'file',
 'storage',
 'and',
 'page',
 'creation,',
 'such',
 'as',
 'google',
 'docs',
 'and',
 'google',
 'sites.']

In [11]:
def textParse(bigString):  # 去掉少于两个字符的字符串，并将所有字符串转换为小写
    import re
    #print(bigString)
    #listOfTokens = re.split(r'\W*', bigString)
    print(listOfTokens)
    #print([token.lower() for token in listOfTokens if len(token) > 2])
    return [token.lower() for token in bigString.split() if len(token) > 2]
docList = []
classList = []
fullText = []
wordList = textParse(open('machinelearninginaction/Ch04/email/spam/1.txt') .read())
docList.append(wordList)
fullText.extend(wordList)
classList.append(1)
wordList = textParse(open('machinelearninginaction/Ch04/email/ham/1.txt' ).read())
docList.append(wordList)
fullText.extend(wordList)
classList.append(0)
classList

['hello,', 'since', 'you', 'are', 'an', 'owner', 'of', 'at', 'least', 'one', 'google', 'groups', 'group', 'that', 'uses', 'the', 'customized', 'welcome', 'message,', 'pages', 'or', 'files,', 'we', 'are', 'writing', 'to', 'inform', 'you', 'that', 'we', 'will', 'no', 'longer', 'be', 'supporting', 'these', 'features', 'starting', 'february', '2011.', 'we', 'made', 'this', 'decision', 'so', 'that', 'we', 'can', 'focus', 'on', 'improving', 'the', 'core', 'functionalities', 'of', 'google', 'groups', '--', 'mailing', 'lists', 'and', 'forum', 'discussions.', 'instead', 'of', 'these', 'features,', 'we', 'encourage', 'you', 'to', 'use', 'products', 'that', 'are', 'designed', 'specifically', 'for', 'file', 'storage', 'and', 'page', 'creation,', 'such', 'as', 'google', 'docs', 'and', 'google', 'sites.']
['hello,', 'since', 'you', 'are', 'an', 'owner', 'of', 'at', 'least', 'one', 'google', 'groups', 'group', 'that', 'uses', 'the', 'customized', 'welcome', 'message,', 'pages', 'or', 'files,', 'we', 

[1, 0]

In [12]:
wordList = textParse(open('machinelearninginaction/Ch04/email/spam/2.txt') .read())
docList.append(wordList)
fullText.extend(wordList)
classList.append(1)
wordList = textParse(open('machinelearninginaction/Ch04/email/ham/2.txt' ).read())
docList.append(wordList)
fullText.extend(wordList)
classList.append(0)
classList[1]

['hello,', 'since', 'you', 'are', 'an', 'owner', 'of', 'at', 'least', 'one', 'google', 'groups', 'group', 'that', 'uses', 'the', 'customized', 'welcome', 'message,', 'pages', 'or', 'files,', 'we', 'are', 'writing', 'to', 'inform', 'you', 'that', 'we', 'will', 'no', 'longer', 'be', 'supporting', 'these', 'features', 'starting', 'february', '2011.', 'we', 'made', 'this', 'decision', 'so', 'that', 'we', 'can', 'focus', 'on', 'improving', 'the', 'core', 'functionalities', 'of', 'google', 'groups', '--', 'mailing', 'lists', 'and', 'forum', 'discussions.', 'instead', 'of', 'these', 'features,', 'we', 'encourage', 'you', 'to', 'use', 'products', 'that', 'are', 'designed', 'specifically', 'for', 'file', 'storage', 'and', 'page', 'creation,', 'such', 'as', 'google', 'docs', 'and', 'google', 'sites.']
['hello,', 'since', 'you', 'are', 'an', 'owner', 'of', 'at', 'least', 'one', 'google', 'groups', 'group', 'that', 'uses', 'the', 'customized', 'welcome', 'message,', 'pages', 'or', 'files,', 'we', 

0

In [13]:
'''def textParse(bigString):  # 去掉少于两个字符的字符串，并将所有字符串转换为小写
    import re
    #print(bigString)
    #listOfTokens = re.split(r'\W*', bigString)
    print(listOfTokens)
    #print([token.lower() for token in listOfTokens if len(token) > 2])
    return [token.lower() for token in bigString.split() if len(token) > 2]'''    
'''
def spamTest():
    docList = []
    classList = []
    fullText = []
    for i in range(17,18):
        print(i)
        wordList = textParse(open('machinelearninginaction/Ch04/email/spam/%d.txt' % i).read())
        #print(wordList)
        # add to doc list
        docList.append(wordList)
        # add to full text
        fullText.extend(wordList)
        # add to class list
        classList.append(1)
        # read email
        wordList = textParse(open('machinelearninginaction/Ch04/email/ham/%d.txt' % i).read())
        # add to doc list
        docList.append(wordList)
        # add to full text
        fullText.extend(wordList)
        # add to class list
        classList.append(0)
    # create vocabulary list
    vocabList = createVocabList(docList)
    # create empty list
    trainingSet = list(range(50))
    # create empty list
    testSet = []
    # for 10 times
    for i in range(10):
        # choose random index
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        # add to test set
        testSet.append(trainingSet[randIndex])
        # delete from training set
        del (trainingSet[randIndex])
    # create empty list
    trainMat = []
    # create empty list
    trainClasses = []
    # for each index in training set
    for docIndex in trainingSet:
        # convert doc to vector
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
        # add to train classes
        trainClasses.append(classList[docIndex])
    # train classifier
    p0V, p1V, pSpam = trainNBO(np.array(trainMat), np.array(trainClasses))
    # create empty list
    errorCount = 0
    # for each index in test set
    for docIndex in testSet:
        # convert doc to vector
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        # classify doc
        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            # increment error count
            errorCount += 1
            # print email text
    print("classification error", docList[docIndex])
    print(docList)
'''    
# spamTest()

'\ndef spamTest():\n    docList = []\n    classList = []\n    fullText = []\n    for i in range(17,18):\n        print(i)\n        wordList = textParse(open(\'machinelearninginaction/Ch04/email/spam/%d.txt\' % i).read())\n        #print(wordList)\n        # add to doc list\n        docList.append(wordList)\n        # add to full text\n        fullText.extend(wordList)\n        # add to class list\n        classList.append(1)\n        # read email\n        wordList = textParse(open(\'machinelearninginaction/Ch04/email/ham/%d.txt\' % i).read())\n        # add to doc list\n        docList.append(wordList)\n        # add to full text\n        fullText.extend(wordList)\n        # add to class list\n        classList.append(0)\n    # create vocabulary list\n    vocabList = createVocabList(docList)\n    # create empty list\n    trainingSet = list(range(50))\n    # create empty list\n    testSet = []\n    # for 10 times\n    for i in range(10):\n        # choose random index\n        randIndex

In [14]:
import feedparser
ny = feedparser.parse('http://www.nasa.gov/rss/dyn/image_of_the_day.rss')
len(ny['entries'])


60

In [19]:
def textParse(bigString):  # 去掉少于两个字符的字符串，并将所有字符串转换为小写
    import re
    listOfTokens = re.split(r'\W*', bigString)
    return [token.lower() for token in listOfTokens if len(token) > 2]

    

def calcMostFreq(vocabList, fullText):
    # create empty dictionary
    freqDict = {}
    # for each token in vocab list
    for token in vocabList:
        # add to dictionary
        freqDict[token] = fullText.count(token)
    # create empty list
    sortedFreq = sorted(freqDict.items(), key=lambda x: x[1], reverse=True)
    # return top 30 words
    return sortedFreq[:30]

def localWords(feed1, feed0):
    import feedparser
    # create empty list
    docList = []
    # create empty list
    classList = []
    # create empty list
    fullText = []
    # minimum length
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    # for each entry in feed 1
    for i in range(minLen):
        # get entry
        wordList = textParse(feed1['entries'][i]['summary'])
        # add to doc list
        docList.append(wordList)
        # add to full text
        fullText.extend(wordList)
        # add to class list
        classList.append(1)
        # get entry
        wordList = textParse(feed0['entries'][i]['summary'])
        # add to doc list
        docList.append(wordList)
        # add to full text
        fullText.extend(wordList)
        # add to class list
        classList.append(0)
    # create vocabulary list
    vocabList = createVocabList(docList)
    # get top 30 words
    top30Words = calcMostFreq(vocabList, fullText)
    # for each word in top 30 words
    for pairW in top30Words:
        # if word in vocab list
        if pairW[0] in vocabList:
            # remove from vocab list
            vocabList.remove(pairW[0])
    # create empty list
    trainingSet = list(range(2 * minLen))
    # create empty list
    testSet = []
    # for 20 times
    for i in range(20):
        # choose random index
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        # add to test set
        testSet.append(trainingSet[randIndex])
        # delete from training set
        del (trainingSet[randIndex])
    # create empty list
    trainMat = []
    # create empty list
    trainClasses = []
    # for each index in training set
    for docIndex in trainingSet:
        # convert doc to vector
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        # add to train classes
        trainClasses.append(classList[docIndex])
    # train classifier
    p0V, p1V, pSpam = trainNBO(np.array(trainMat), np.array(trainClasses))
    # create empty list
    errorCount = 0
    # for each index in test set
    for docIndex in testSet:
        # convert doc to vector
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        # classify doc
        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            # increment error count
            errorCount += 1
    # print error rate
    print('the error rate is: ', float(errorCount) / len(testSet))
    # return vocab list, p0v, and p1v
    return vocabList, p0V, p1V
            
    

In [20]:
ny = feedparser.parse('http://www.nasa.gov/rss/dyn/image_of_the_day.rss')
sf = feedparser.parse('http://sports.yahoo.com/nba/teams/hou/rss.xml')
vocabList, pSF, pNY = localWords(ny, sf)
#vocabList, pSF, pNY = localWords(ny, sf)

IndexError: list index out of range