### 解决此类问题的一般步骤
1. 收集数据：提供文本文件。
2. 准备数据：将文本文件解析成词条向量。
3. 分析数据：检查词条确保解析的正确性。
4. 训练算法：使用下面的trainNB()函数。
5. 测试算法：使用classify()，并且构建一个新的测试函数来计算文档集的错误率。
6. 使用算法：构建一个完整的程序对一组文档进行分类，将错分的文档输出到屏幕上。

In [18]:
import pandas as pd

# Load the tab-separated data file into a DataFrame.
raw_data = pd.read_csv('./asset/data.txt', sep='\t')
# Bare expression: the notebook renders the DataFrame as the cell output.
raw_data

Unnamed: 0,category,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...


In [3]:
# ListClasses holds the category label of every training row, in row order.
# Reading the whole `category` column at once avoids building a separate
# row Series through `iloc` for every index.
ListClasses = raw_data['category'].tolist()

# Bare expression: the notebook renders the list as the cell output.
ListClasses

['ham', 'ham', 'spam', 'ham', 'ham', 'spam']

In [17]:
def tokenize(sms):
    """Split a message into its whitespace-separated tokens.

    Args:
        sms: The message text (a string).

    Returns:
        The tokens in their original order. Runs of whitespace, as well as
        leading or trailing whitespace, never produce empty-string tokens,
        and tabs/newlines separate tokens just like spaces do.
    """
    # split() with no argument collapses whitespace runs; split(' ') would
    # emit '' for every extra space and pollute the vocabulary with it.
    return sms.split()

# ListPost holds the tokenized text of every training row, in row order.
ListPost = [tokenize(text) for text in raw_data['text']]
# ListClasses holds the matching labels: 'ham' becomes 0 and every other
# category (spam) becomes 1.
ListClasses = [
    0 if category == 'ham' else 1 for category in raw_data['category']
]

print(ListClasses)


def createVocabList(dataset):
    """Return the distinct words that occur anywhere in *dataset*.

    Args:
        dataset: Iterable of documents, each an iterable of hashable words
            (the training texts after tokenization).

    Returns:
        A list containing every distinct word exactly once. The order is
        whatever set iteration yields and is not otherwise defined.
    """
    vocabSet = set()
    for document in dataset:
        # In-place union: avoids rebuilding the whole set for each document.
        vocabSet.update(document)
    return list(vocabSet)
# Build the vocabulary from all tokenized training texts.
myVocabList = createVocabList(ListPost)
# Report how many distinct words the vocabulary contains.
print(len(myVocabList))

def bagOfWords2VecMN(vocabList, inputSet):
    """Convert a document into a bag-of-words count vector.

    Unlike a set-of-words model, every occurrence of a word is counted, so
    a word that appears several times contributes several times.

    Args:
        vocabList: List of vocabulary words; position i of the result
            counts occurrences of vocabList[i].
        inputSet: Iterable of hashable words making up the document.

    Returns:
        A list of ints with len(vocabList) entries. Words that are not in
        the vocabulary are ignored.
    """
    # Build the word -> index table once so each token costs one hash lookup
    # instead of two linear scans. setdefault keeps the first position when
    # a word is repeated in vocabList, matching list.index().
    positions = {}
    for position, word in enumerate(vocabList):
        positions.setdefault(word, position)

    Vector = [0] * len(vocabList)
    for word in inputSet:
        position = positions.get(word)
        if position is not None:
            Vector[position] += 1
    return Vector



[0, 0, 1, 0, 0, 1]
99


In [12]:
def trainNB(trainMatrix, trainCategory, alpha=1):
    """Estimate multinomial naive Bayes parameters from word-count vectors.

    Args:
        trainMatrix: Sequence of word-count vectors, one per document, all
            as long as the vocabulary.
        trainCategory: Sequence of labels aligned with trainMatrix. A label
            equal to 1 puts the document in class 1 (spam); any other value
            puts it in class 0. The prior is computed as
            sum(trainCategory) / number of documents, so labels are
            expected to be 0 or 1.
        alpha: Additive smoothing constant. The default of 1 is Laplace
            smoothing; with alpha=0 a class without any words causes a
            division by zero.

    Returns:
        A tuple (p0Vector, p1Vector, pSpam): the smoothed per-word
        probabilities p(word | class 0) and p(word | class 1) as lists of
        floats, and the prior p(class 1).

    Raises:
        IndexError: If trainMatrix is empty.
    """
    numTrainDocs = len(trainMatrix)  # total number of documents
    numWords = len(trainMatrix[0])  # vocabulary size
    # p(c1) = documents in class 1 / total documents
    pSpam = sum(trainCategory) / float(numTrainDocs)

    # Index 0 accumulates class 0, index 1 accumulates class 1.
    wordCounts = ([0] * numWords, [0] * numWords)
    totalWords = [0.0, 0.0]  # total number of word occurrences per class
    for i, row in enumerate(trainMatrix):
        cls = 1 if trainCategory[i] == 1 else 0
        counts = wordCounts[cls]
        for j in range(numWords):
            counts[j] += row[j]
        totalWords[cls] += sum(row)

    # Smoothing keeps words unseen in a class from getting probability 0.
    p0Denom = totalWords[0] + alpha * numWords
    p1Denom = totalWords[1] + alpha * numWords
    p0Vector = [(count + alpha) / p0Denom for count in wordCounts[0]]
    p1Vector = [(count + alpha) / p1Denom for count in wordCounts[1]]
    return p0Vector, p1Vector, pSpam

# Turn every tokenized training text into its bag-of-words count vector.
trainMat = [bagOfWords2VecMN(myVocabList, eachDoc) for eachDoc in ListPost]
# Bare expression: the notebook renders the (p0Vector, p1Vector, pSpam)
# tuple returned by trainNB as the cell output.
trainNB(trainMat,ListClasses)

([0.006711409395973154,
  0.006711409395973154,
  0.006711409395973154,
  0.013422818791946308,
  0.006711409395973154,
  0.006711409395973154,
  0.013422818791946308,
  0.006711409395973154,
  0.020134228187919462,
  0.006711409395973154,
  0.013422818791946308,
  0.013422818791946308,
  0.013422818791946308,
  0.006711409395973154,
  0.006711409395973154,
  0.006711409395973154,
  0.006711409395973154,
  0.013422818791946308,
  0.006711409395973154,
  0.013422818791946308,
  0.013422818791946308,
  0.013422818791946308,
  0.006711409395973154,
  0.013422818791946308,
  0.013422818791946308,
  0.013422818791946308,
  0.013422818791946308,
  0.013422818791946308,
  0.006711409395973154,
  0.006711409395973154,
  0.006711409395973154,
  0.006711409395973154,
  0.013422818791946308,
  0.006711409395973154,
  0.006711409395973154,
  0.006711409395973154,
  0.013422818791946308,
  0.013422818791946308,
  0.013422818791946308,
  0.006711409395973154,
  0.006711409395973154,
  0.006711409395

In [14]:
# Train once and keep the per-word probabilities of class 0 (p0V) and
# class 1 (p1V) plus the class-1 prior; classify() reads these globals.
p0V,p1V,pSpam = trainNB(trainMat,ListClasses)
# Bare expression: the notebook renders the prior as the cell output.
pSpam

0.3333333333333333

In [8]:
def classify(testEntry):
    """Return the class-1 (spam) to class-0 (ham) posterior odds of a message.

    Reads the module-level `myVocabList`, `p0V`, `p1V` and `pSpam`. The
    computed odds are also printed.

    Args:
        testEntry: Sequence of tokens. Tokens that are not in myVocabList
            are ignored.

    Returns:
        The ratio p(class 1 | message) / p(class 0 | message) as a float;
        values above 1 favour class 1.

    Raises:
        ZeroDivisionError: If pSpam is 1 or a matched word has probability
            0 under class 0.
    """
    # Accumulate the ratio itself rather than two separate products: each
    # product shrinks towards 0.0 with every token and on long messages
    # both underflow, which turned the final division into 0.0 / 0.0.
    odds = pSpam / (1 - pSpam)
    for token in testEntry:
        try:
            # One vocabulary scan per token (instead of a membership test
            # followed by two index() calls).
            position = myVocabList.index(token)
        except ValueError:
            continue
        odds *= p1V[position] / p0V[position]
    print(odds)
    return odds

In [15]:
# Example message to score.
sms = 'I am not spam'

# Tokenize the message and compute its class-1/class-0 ratio; classify()
# also prints the value.
testEntry = tokenize(sms)
a = classify(testEntry)

0.2342767295597484
