In [1]:
import numpy as np

In [2]:
# 准备数据：词表到向量的转换函数
# 创建实验样本
def loadDataSet():
    """Return the toy message-board corpus together with its class labels.

    Returns:
        tuple: ``(postingList, classVec)``. ``postingList`` is a list of six
        posts, each already tokenized into a list of word strings.
        ``classVec`` is the parallel list of labels, where 1 marks an
        abusive post and 0 a non-abusive one.
    """
    # One post per entry; splitting on whitespace yields the token lists.
    raw_posts = (
        'my dog has flea problems help please',
        'maybe not take him to dog park stupid',
        'my dalmation is so cute I love him',
        'stop posting stupid worthless garbage',
        'mr licks ate my steak how to stop him',
        'quit buying worthless dog food stupid',
    )
    postingList = [post.split() for post in raw_posts]
    # Labels line up with raw_posts by position.
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec

In [3]:
def creatVocabList(dataSet):
    """Collect every distinct word that appears anywhere in the data set.

    Args:
        dataSet: sequence of documents, each an iterable of hashable words.

    Returns:
        list: the unique words. The order is whatever set iteration yields,
        so callers must not rely on a particular ordering.
    """
    # Union of all documents in one call; an empty data set gives an empty set.
    return list(set().union(*dataSet))

In [4]:
def setOfWords2Vec(vocabList, inputSet):
    """Encode a tokenized document as a binary (set-of-words) vector.

    Args:
        vocabList: list of vocabulary words; position defines the feature index.
        inputSet: iterable of words making up one document.

    Returns:
        numpy.ndarray: float vector of length ``len(vocabList)`` holding 1 at
        the index of every vocabulary word present in the document and 0
        elsewhere. Repeated words still yield 1.

    Words missing from the vocabulary are reported on stdout and skipped.
    """
    presence = np.zeros(len(vocabList))
    for token in inputSet:
        try:
            slot = vocabList.index(token)
        except ValueError:
            # list.index raises ValueError when the token is not in the vocabulary.
            print("The word %s is not in my Vocabulary!" % token)
            continue
        presence[slot] = 1
    return presence

In [5]:
# Load the toy corpus, echo both parts, then derive the vocabulary from it.
listOPosts, listClasses = loadDataSet()
print('listOposts')
print(listOPosts)
print('listClasses')
print(listClasses)

myVocabList = creatVocabList(listOPosts)
print(myVocabList)

listOposts
[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'], ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'], ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'], ['stop', 'posting', 'stupid', 'worthless', 'garbage'], ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'], ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
listClasses
[0, 1, 0, 1, 0, 1]
['stop', 'help', 'park', 'dog', 'quit', 'please', 'stupid', 'posting', 'garbage', 'my', 'him', 'dalmation', 'steak', 'food', 'flea', 'ate', 'mr', 'problems', 'cute', 'take', 'maybe', 'has', 'love', 'not', 'licks', 'worthless', 'to', 'is', 'buying', 'I', 'so', 'how']


In [6]:
# Encode the first post against the vocabulary; as the last expression in the
# cell, the resulting vector is displayed as the cell output.
setOfWords2Vec(myVocabList, listOPosts[0])

array([0., 1., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0.,
       1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [7]:
def trainNB0(trainMatrix, trainCategory):
    """Estimate naive Bayes parameters for a two-class problem.

    Bayes' rule: p(c_i | w) = p(w | c_i) * p(c_i) / p(w).

    Args:
        trainMatrix: sequence of n document vectors, all of vocabulary length.
        trainCategory: sequence of n labels; a label equal to 1 is treated as
            the abusive class, any other value as the non-abusive class.

    Returns:
        tuple: ``(p0Vect, p1Vect, pAbusive)`` where ``p0Vect`` and ``p1Vect``
        are the per-word log conditional probabilities log p(w_i | c0) and
        log p(w_i | c1), and ``pAbusive`` is the prior p(c1).
    """
    doc_count = len(trainMatrix)
    vocab_size = len(trainMatrix[0])
    # Prior p(c1): the sum of the labels divided by the number of documents.
    pAbusive = sum(trainCategory) / float(doc_count)

    # Per-class accumulators, indexed by class (0 or 1). Word counts start at
    # one and totals at two so that no word ends up with zero probability.
    word_counts = [np.ones(vocab_size), np.ones(vocab_size)]
    word_totals = [2.0, 2.0]
    for i in range(doc_count):
        cls = 1 if trainCategory[i] == 1 else 0
        # Add this document's word vector to its class's running counts ...
        word_counts[cls] += trainMatrix[i]
        # ... and its total word mass to that class's denominator.
        word_totals[cls] += np.sum(trainMatrix[i])

    # Work in log space so later products of many small numbers become sums.
    p0Vect = np.log(word_counts[0] / word_totals[0])
    p1Vect = np.log(word_counts[1] / word_totals[1])
    return p0Vect, p1Vect, pAbusive

In [8]:
# Build the training matrix: one vector per post, produced by setOfWords2Vec
# against the shared vocabulary.
trainMat = []
# Convert each tokenized post from words into its vector encoding.
for posinDoc in listOPosts:
    trainMat.append(setOfWords2Vec(myVocabList, posinDoc))
print(trainMat)

[array([0., 1., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0.,
       1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), array([0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0.,
       0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 1., 1., 0.]), array([1., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.]), array([1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 1., 0., 0., 1., 1.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1.]), array([0., 0., 0., 1., 1., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0.])]


In [9]:
# Fit the classifier on the encoded posts; trainNB0 returns the two per-word
# log-probability vectors (class 0, class 1) and the class-1 prior.
p0V, p1V, pAb = trainNB0(trainMat, listClasses)

In [10]:
# Inspect the learned parameters: class-0 vector, class-1 vector, then the prior.
print(p0V)
print(p1V)
print(pAb)

[-2.56494936 -2.56494936 -3.25809654 -2.56494936 -3.25809654 -2.56494936
 -3.25809654 -3.25809654 -3.25809654 -1.87180218 -2.15948425 -2.56494936
 -2.56494936 -3.25809654 -2.56494936 -2.56494936 -2.56494936 -2.56494936
 -2.56494936 -3.25809654 -3.25809654 -2.56494936 -2.56494936 -3.25809654
 -2.56494936 -3.25809654 -2.56494936 -2.56494936 -3.25809654 -2.56494936
 -2.56494936 -2.56494936]
[-2.35137526 -3.04452244 -2.35137526 -1.94591015 -2.35137526 -3.04452244
 -1.65822808 -2.35137526 -2.35137526 -3.04452244 -2.35137526 -3.04452244
 -3.04452244 -2.35137526 -3.04452244 -3.04452244 -3.04452244 -3.04452244
 -3.04452244 -2.35137526 -2.35137526 -3.04452244 -3.04452244 -2.35137526
 -3.04452244 -1.94591015 -2.35137526 -3.04452244 -2.35137526 -3.04452244
 -3.04452244 -3.04452244]
0.5


In [15]:
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Pick the more probable class for one encoded document.

    The posterior is proportional to p(w | c) * p(c); the evidence p(w) is the
    same for both classes and is dropped. Working with logarithms turns the
    product over words into a sum.

    Args:
        vec2Classify: document vector of vocabulary length.
        p0Vec: per-word log p(w_i | c0), same length as ``vec2Classify``.
        p1Vec: per-word log p(w_i | c1), same length as ``vec2Classify``.
        pClass1: prior probability p(c1); p(c0) is taken as ``1 - pClass1``.

    Returns:
        int: 1 when the class-1 score is strictly greater, otherwise 0.
    """
    log_prior_1 = np.log(pClass1)
    log_prior_0 = np.log(1.0 - pClass1)
    # Only the words present in the document contribute to each score.
    score_1 = np.sum(vec2Classify * p1Vec) + log_prior_1
    score_0 = np.sum(vec2Classify * p0Vec) + log_prior_0
    return 1 if score_1 > score_0 else 0

In [20]:
def testingNB():
    """Train on the toy corpus and print the predicted class of two sample posts.

    The function is self-contained: it loads the corpus, builds the
    vocabulary, encodes the posts, trains the classifier and then prints one
    line per sample in the form ``<tokens> classified as: <label>``.

    Returns:
        None. Results are written to stdout.
    """
    # Bind the corpus to the same name that is read below, so the function does
    # not depend on a similarly named notebook-level variable and runs on a
    # fresh kernel.
    listOPosts, listClasses = loadDataSet()
    myVocabList = creatVocabList(listOPosts)
    trainMat = [setOfWords2Vec(myVocabList, postinDoc) for postinDoc in listOPosts]
    p0V, p1V, pAb = trainNB0(np.array(trainMat), np.array(listClasses))

    # One benign-looking and one abusive-looking sample.
    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
        print(testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb))

In [21]:
# Run the end-to-end demo; it prints one classification line per sample post.
testingNB()

['love', 'my', 'dalmation'] classified as: 0
['stupid', 'garbage'] classified as: 1


In [22]:
def bagOfWords2VecMN(vocabList, inputSet):
    """Encode a tokenized document as a bag-of-words count vector.

    Args:
        vocabList: list of vocabulary words; position defines the feature index.
        inputSet: iterable of words making up one document.

    Returns:
        numpy.ndarray: float vector of length ``len(vocabList)`` whose entries
        count how many times each vocabulary word occurs in the document.
        Words that are not in the vocabulary are silently ignored.
    """
    counts = np.zeros(len(vocabList))
    for token in inputSet:
        try:
            slot = vocabList.index(token)
        except ValueError:
            # Token is not part of the vocabulary; skip it.
            continue
        counts[slot] += 1
    return counts

In [23]:
# Naive tokenization: str.split() with no arguments splits on whitespace only,
# so punctuation stays attached to the neighbouring token.
mySent = 'This book is the best book on Python or M.L. I have ever laid eyes upon.'
mySent.split()

['This',
 'book',
 'is',
 'the',
 'best',
 'book',
 'on',
 'Python',
 'or',
 'M.L.',
 'I',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon.']

In [24]:
import re
# Split on runs of non-word characters. The quantifier must be `+`, not `*`:
# a pattern that can match the empty string makes re.split (Python 3.7+) also
# split between every pair of characters, and older versions emit a
# FutureWarning for it.
regEx = re.compile(r'\W+')
listOfTokens = regEx.split(mySent)
# A trailing '' remains because the sentence ends with punctuation.
print(listOfTokens)

['This', 'book', 'is', 'the', 'best', 'book', 'on', 'Python', 'or', 'M', 'L', 'I', 'have', 'ever', 'laid', 'eyes', 'upon', '']


  This is separate from the ipykernel package so we can avoid doing imports until
