# Classifying with probability theory: naive Bayes
by Raziel Lopez Escamilla

Prepare: making word vectors from text

In [1]:
from numpy import *

def loadDataSet():
    """Return the toy corpus used throughout this notebook.

    Returns:
        A tuple ``(posts, labels)``. ``posts`` is a list of six token lists
        and ``labels`` is the parallel list of ints, where 1 marks an abusive
        post and 0 a non-abusive one.
    """
    # Each entry pairs a tokenized post with its label (1 = abusive, 0 = not).
    labelledPosts = [
        (['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'], 0),
        (['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'], 1),
        (['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'], 0),
        (['stop', 'posting', 'stupid', 'worthless', 'garbage'], 1),
        (['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'], 0),
        (['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'], 1),
    ]
    posts = [tokens for tokens, _ in labelledPosts]
    labels = [label for _, label in labelledPosts]
    return posts, labels

In [2]:
def createVocabList(dataSet):
    """Collect every distinct token that appears in ``dataSet``.

    Args:
        dataSet: iterable of documents, each an iterable of hashable tokens.

    Returns:
        A list holding each distinct token once. The order is whatever the
        underlying set yields, so it is not guaranteed to be stable.
    """
    vocabulary = set()
    for tokens in dataSet:
        # Fold this document's tokens into the running vocabulary.
        vocabulary = vocabulary.union(set(tokens))
    return list(vocabulary)


In [3]:
def setOfWords2Vec(vocabList, inputSet):
    """Encode a document as a 0/1 presence vector over ``vocabList``.

    Args:
        vocabList: list of vocabulary words; position defines the vector slot.
        inputSet: iterable of words in the document.

    Returns:
        A list of ints, as long as ``vocabList``, with 1 where the vocabulary
        word occurs in ``inputSet`` and 0 elsewhere. Words that are not in the
        vocabulary are reported with a printed message and otherwise ignored.
    """
    presence = [0 for _ in range(len(vocabList))]
    for token in inputSet:
        if token not in vocabList:
            print ("the word: %s is not in my Vocabulary!" % token)
            continue
        presence[vocabList.index(token)] = 1   # set-of-words: repeats stay at 1
    return presence

In [4]:
# Load the toy posts and their labels, then build the vocabulary from every post.
listOPost, listClasses = loadDataSet()
myVocabList = createVocabList(listOPost)
myVocabList  # last expression of the cell, so the notebook displays it

['help',
 'to',
 'mr',
 'is',
 'food',
 'ate',
 'maybe',
 'stupid',
 'dalmation',
 'so',
 'take',
 'please',
 'problems',
 'love',
 'dog',
 'I',
 'posting',
 'flea',
 'park',
 'garbage',
 'cute',
 'quit',
 'licks',
 'buying',
 'him',
 'stop',
 'how',
 'not',
 'has',
 'worthless',
 'steak',
 'my']

In [5]:
setOfWords2Vec(myVocabList, listOPost[0])

[1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1]

In [6]:
setOfWords2Vec(myVocabList, listOPost[3])

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0]

In [7]:
def trainNB0(trainMatrix,trainCategory):
    """Estimate per-word conditional probabilities for a two-class naive Bayes.

    Args:
        trainMatrix: sequence of equal-length numeric word vectors, one per
            training document.
        trainCategory: sequence of labels parallel to ``trainMatrix``; a
            label equal to 1 selects class 1, anything else class 0.

    Returns:
        A tuple ``(p0Vect, p1Vect, pAbusive)``: for each class, the per-word
        counts divided by the total word count of that class, plus the sum of
        the labels divided by the number of documents.

    Note:
        The intermediate class-1 counts and the final class-1 totals are
        printed as the function runs.
    """
    docTotal = len(trainMatrix)
    vocabSize = len(trainMatrix[0])
    pAbusive = sum(trainCategory)/float(docTotal)
    # Per-word running counts and total word counts, one pair per class.
    class0Counts = zeros(vocabSize)
    class1Counts = zeros(vocabSize)
    class0Total = 0.0
    class1Total = 0.0
    for docIdx in range(docTotal):
        if trainCategory[docIdx] != 1:
            class0Counts += trainMatrix[docIdx]
            class0Total += sum(trainMatrix[docIdx])
            continue
        print(class1Counts)
        class1Counts += trainMatrix[docIdx]     # element-wise accumulation
        print(class1Counts)
        class1Total += sum(trainMatrix[docIdx])
    print(class1Counts)
    print(class1Total)
    p1Vect = class1Counts/class1Total   # element-wise division
    p0Vect = class0Counts/class0Total   # element-wise division
    return p0Vect,p1Vect,pAbusive

Populate the training matrix

In [8]:
# One presence vector per post, in the same order as listOPost.
trainMat = [setOfWords2Vec(myVocabList, post) for post in listOPost]
trainMat

[[1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1],
 [0,
  1,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  1],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0],
 [0,
  1,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  1,
  1,
  0,
  0,
  0,
  1,
  1],
 [0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0]]

Compute probabilities

In [9]:
# Train on the toy corpus; trainNB0 also prints its running class-1 counts.
p0V, p1V ,pAb = trainNB0(trainMat, listClasses) 

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 1. 0. 0. 1. 0. 0. 0. 0.]
[0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 1. 0. 0. 1. 0. 0. 0. 0.]
[0. 1. 0. 0. 0. 0. 1. 2. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 1. 1. 0. 0. 0. 0.
 1. 1. 0. 1. 0. 1. 0. 0.]
[0. 1. 0. 0. 0. 0. 1. 2. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 1. 1. 0. 0. 0. 0.
 1. 1. 0. 1. 0. 1. 0. 0.]
[0. 1. 0. 0. 1. 0. 1. 3. 0. 0. 1. 0. 0. 0. 2. 0. 1. 0. 1. 1. 0. 1. 0. 1.
 1. 1. 0. 1. 0. 2. 0. 0.]
[0. 1. 0. 0. 1. 0. 1. 3. 0. 0. 1. 0. 0. 0. 2. 0. 1. 0. 1. 1. 0. 1. 0. 1.
 1. 1. 0. 1. 0. 2. 0. 0.]
19.0


In [10]:
pAb

0.5

In [11]:
p0V

array([0.04166667, 0.04166667, 0.04166667, 0.04166667, 0.        ,
       0.04166667, 0.        , 0.        , 0.04166667, 0.04166667,
       0.        , 0.04166667, 0.04166667, 0.04166667, 0.04166667,
       0.04166667, 0.        , 0.04166667, 0.        , 0.        ,
       0.04166667, 0.        , 0.04166667, 0.        , 0.08333333,
       0.04166667, 0.04166667, 0.        , 0.04166667, 0.        ,
       0.04166667, 0.125     ])

In [12]:
p1V

array([0.        , 0.05263158, 0.        , 0.        , 0.05263158,
       0.        , 0.05263158, 0.15789474, 0.        , 0.        ,
       0.05263158, 0.        , 0.        , 0.        , 0.10526316,
       0.        , 0.05263158, 0.        , 0.05263158, 0.05263158,
       0.        , 0.05263158, 0.        , 0.05263158, 0.05263158,
       0.05263158, 0.        , 0.05263158, 0.        , 0.10526316,
       0.        , 0.        ])

Let's improve training by initializing the occurrence counts to 1 and the denominators to 2. This prevents the whole product from becoming zero when any single word probability is 0. We also take the logarithm of the probabilities to prevent underflow caused by multiplying many small numbers.

In [13]:
def trainNB0_imp(trainMatrix,trainCategory):
    """Train the two-class naive Bayes model with smoothed, log-scaled output.

    Word counts start at 1 and class totals at 2.0, so no word ends up with a
    zero probability, and the returned vectors hold natural logarithms.

    Args:
        trainMatrix: sequence of equal-length numeric word vectors, one per
            training document.
        trainCategory: sequence of labels parallel to ``trainMatrix``; a
            label equal to 1 selects class 1, anything else class 0.

    Returns:
        A tuple ``(p0Vect, p1Vect, pAbusive)``: the log of each class's
        smoothed per-word ratio, plus the sum of the labels divided by the
        number of documents.
    """
    docTotal = len(trainMatrix)
    vocabSize = len(trainMatrix[0])
    pAbusive = sum(trainCategory)/float(docTotal)
    # Counts start at 1 and totals at 2.0 instead of 0.
    class0Counts = ones(vocabSize)
    class1Counts = ones(vocabSize)
    class0Total = 2.0
    class1Total = 2.0
    for docIdx in range(docTotal):
        if trainCategory[docIdx] != 1:
            class0Counts += trainMatrix[docIdx]
            class0Total += sum(trainMatrix[docIdx])
            continue
        class1Counts += trainMatrix[docIdx]     # element-wise accumulation
        class1Total += sum(trainMatrix[docIdx])
    p1Vect = log(class1Counts/class1Total)      # log of element-wise ratio
    p0Vect = log(class0Counts/class0Total)      # log of element-wise ratio
    return p0Vect,p1Vect,pAbusive

Build classifier

In [14]:
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Pick the class with the higher log score for one word vector.

    Args:
        vec2Classify: word vector of the document to classify.
        p0Vec: per-word log probabilities for class 0.
        p1Vec: per-word log probabilities for class 1.
        pClass1: prior probability of class 1.

    Returns:
        1 if the class-1 score is strictly greater than the class-0 score,
        otherwise 0 (so a tie goes to class 0).
    """
    # Log scores: sum of the element-wise products plus the log prior.
    scoreClass1 = sum(vec2Classify * p1Vec) + log(pClass1)
    scoreClass0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    return 1 if scoreClass1 > scoreClass0 else 0

Test classifier

In [15]:
def testingNB():
    """Train on the toy corpus and print the predicted class of three samples.

    Builds the vocabulary and presence vectors from ``loadDataSet()``, trains
    with ``trainNB0_imp``, then classifies three hand-written test entries and
    prints each entry alongside its predicted label.
    """
    posts, labels = loadDataSet()
    vocab = createVocabList(posts)
    featureRows = [setOfWords2Vec(vocab, post) for post in posts]
    p0V,p1V,pAb = trainNB0_imp(array(featureRows),array(labels))
    sampleEntries = (
        ['love', 'my', 'dalmation'],
        ['stupid', 'garbage'],
        ['stupid', 'steak'],
    )
    for testEntry in sampleEntries:
        thisDoc = array(setOfWords2Vec(vocab, testEntry))
        print (testEntry,'classified as: ',classifyNB(thisDoc,p0V,p1V,pAb))

In [16]:
testingNB()

['love', 'my', 'dalmation'] classified as:  0
['stupid', 'garbage'] classified as:  1
['stupid', 'steak'] classified as:  1


Another method that tracks the repetitions of a word

In [None]:
def bagOfwords2VecMN(vocabList, inputSet):
    """Encode a document as a word-count vector over ``vocabList``.

    Unlike ``setOfWords2Vec``, which only records presence, each slot holds
    the number of times the vocabulary word occurs in the document.

    Args:
        vocabList: list of vocabulary words; position defines the vector slot.
        inputSet: iterable of words in the document (repeats are counted).

    Returns:
        A list of ints, as long as ``vocabList``, with the occurrence count of
        each vocabulary word. Words missing from the vocabulary are skipped.
    """
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        # Skip out-of-vocabulary words; vocabList.index() would raise ValueError.
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec


# Classifying spam email with naive Bayes

Tokenizing text

In [17]:
mySent = 'This book is the best book on Python or M.L. I have ever laid eyes upon'

In [23]:
mySent.split()

['This',
 'book',
 'is',
 'the',
 'best',
 'book',
 'on',
 'Python',
 'or',
 'M.L.',
 'I',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon']

Use regular expressions to strip punctuation from the tokens

In [38]:
import re
# Split on runs of one or more non-word characters, so punctuation is dropped.
regEx = re.compile('\\W+')
listOfTokens = regEx.split(mySent)
listOfTokens  # displayed as the cell output

['This',
 'book',
 'is',
 'the',
 'best',
 'book',
 'on',
 'Python',
 'or',
 'M',
 'L',
 'I',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon']

In [40]:
[tok.lower() for tok in listOfTokens if len(tok) >0]

['this',
 'book',
 'is',
 'the',
 'best',
 'book',
 'on',
 'python',
 'or',
 'm',
 'l',
 'i',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon']

In [46]:
# Read one sample ham email; the `with` block closes the file handle as soon
# as the text has been read instead of leaving it open.
with open('email/ham/6.txt') as emailFile:
    emailText = emailFile.read()
# Tokenize with the same non-word-character splitter compiled earlier.
listOftokens = regEx.split(emailText)

In [47]:
listOftokens

['Hello',
 'Since',
 'you',
 'are',
 'an',
 'owner',
 'of',
 'at',
 'least',
 'one',
 'Google',
 'Groups',
 'group',
 'that',
 'uses',
 'the',
 'customized',
 'welcome',
 'message',
 'pages',
 'or',
 'files',
 'we',
 'are',
 'writing',
 'to',
 'inform',
 'you',
 'that',
 'we',
 'will',
 'no',
 'longer',
 'be',
 'supporting',
 'these',
 'features',
 'starting',
 'February',
 '2011',
 'We',
 'made',
 'this',
 'decision',
 'so',
 'that',
 'we',
 'can',
 'focus',
 'on',
 'improving',
 'the',
 'core',
 'functionalities',
 'of',
 'Google',
 'Groups',
 'mailing',
 'lists',
 'and',
 'forum',
 'discussions',
 'Instead',
 'of',
 'these',
 'features',
 'we',
 'encourage',
 'you',
 'to',
 'use',
 'products',
 'that',
 'are',
 'designed',
 'specifically',
 'for',
 'file',
 'storage',
 'and',
 'page',
 'creation',
 'such',
 'as',
 'Google',
 'Docs',
 'and',
 'Google',
 'Sites',
 'For',
 'example',
 'you',
 'can',
 'easily',
 'create',
 'your',
 'pages',
 'on',
 'Google',
 'Sites',
 'and',
 'share',
