# 朴素贝叶斯分类器

作者：周阳

主要任务：

* 分类文件整合
* 基于朴素贝叶斯的概率统计
  - 多项式构建
  - 伯努力构建
  - 混合模型构建
* 每个类别构建词表（20个log词表）
* 通过查表完成测试分类

## 通过文件夹分类文件并且整合

In [1]:
import random as ran
import os
import numpy as np

##### 进度条

In [2]:
import sys

class ProgressBar:
    def __init__(self, count = 0, total = 0, width = 30):
        self.count = count
        self.total = total
        self.width = width
        print('-------------------------------Progressing----------------------------------')
    def move(self):
        self.count += 1
    def log(self):
        sys.stdout.write(' ' * (self.width + 9) + '\r')
        sys.stdout.flush()
        progress = self.width * self.count / self.total
        sys.stdout.write('{0:3}/{1:3} with {2} finished: '.format(self.count, self.total ,str(self.count/self.total*100)[:4]+'%'))
        sys.stdout.write('#' * int(progress) + '-' * int(self.width - progress) + '\r')
        if progress == self.width:
            sys.stdout.write('\n')
        sys.stdout.flush()

In [3]:
classis = {0:'/home/blueberry/data/datamining/20news-18828/alt.atheism',
           1:'/home/blueberry/data/datamining/20news-18828/comp.graphics',
           2:'/home/blueberry/data/datamining/20news-18828/comp.os.ms-windows.misc',
           3:'/home/blueberry/data/datamining/20news-18828/comp.sys.ibm.pc.hardware',
           
           4:'/home/blueberry/data/datamining/20news-18828/comp.sys.mac.hardware',
           5:'/home/blueberry/data/datamining/20news-18828/comp.windows.x',
           6:'/home/blueberry/data/datamining/20news-18828/misc.forsale',
           7:'/home/blueberry/data/datamining/20news-18828/rec.autos',
           
           8:'/home/blueberry/data/datamining/20news-18828/rec.motorcycles',
           9:'/home/blueberry/data/datamining/20news-18828/rec.sport.baseball',
           10:'/home/blueberry/data/datamining/20news-18828/rec.sport.hockey',
           11:'/home/blueberry/data/datamining/20news-18828/sci.crypt',
           
           12:'/home/blueberry/data/datamining/20news-18828/sci.electronics',
           13:'/home/blueberry/data/datamining/20news-18828/sci.med',
           14:'/home/blueberry/data/datamining/20news-18828/sci.space',
           15:'/home/blueberry/data/datamining/20news-18828/soc.religion.christian',
           
           16:'/home/blueberry/data/datamining/20news-18828/talk.politics.guns',
           17:'/home/blueberry/data/datamining/20news-18828/talk.politics.mideast',
           18:'/home/blueberry/data/datamining/20news-18828/talk.politics.misc',
           19:'/home/blueberry/data/datamining/20news-18828/talk.religion.misc'
          }
save_path = '/home/blueberry/data/datamining/class_with_lable.npy'

In [4]:
def traverse(f,result):
    
    fs = os.listdir(f)
    for f1 in fs:
        tmp_path = os.path.join(f,f1)
        if not os.path.isdir(tmp_path):
            result.append(tmp_path)
        else:
            print(tmp_path)
            traverse(tmp_path,result)

###### 注意这里使用的_pre文件是经过上次实验已经进行过去停用词，符号，获得词干等操作的文件结果

In [5]:
keys_classis = list(classis.keys())
text_with_lable = []
for k in keys_classis:
    cls_path = []
    traverse(classis[k],cls_path)
    for path in cls_path:
        if path[-4:]=='_pre':
            with open(path,'r+') as f :
                lines = f.readlines()
                lines = [line.strip('\n') for line in lines]
                lines.insert(0,k)##类别在当前list的第一个位置
            text_with_lable.append(lines)

In [6]:
text_with_lable[:2]

[[0,
  'liveseysolntzewpdsgicom',
  'jon',
  'livesey',
  'subject',
  'cruel',
  'polit',
  'atheist',
  'articl',
  'aprbmerhbnrca',
  'dgrahambmersbnrca',
  'dougla',
  'graham',
  'write',
  'articl',
  'qnedmafidoasdsgicom',
  'liveseysolntzewpdsgicom',
  'jon',
  'livesey',
  'write',
  'articl',
  'qlmdinngapcaltechedu',
  'keithccocaltechedu',
  'keith',
  'allan',
  'schneider',
  'write',
  'spent',
  'quit',
  'bit',
  'time',
  'word',
  'constitut',
  'realis',
  'wide',
  'held',
  'belief',
  'america',
  'fact',
  'claus',
  'cruel',
  'unusu',
  'punish',
  'like',
  'lot',
  'rest',
  'lift',
  'english',
  'bill',
  'right',
  'accord',
  'jerri',
  'mander',
  'absenc',
  'sacr',
  'good',
  'book',
  'btw',
  'great',
  'bind',
  'law',
  'iroquoi',
  'confederaci',
  'also',
  'play',
  'signific',
  'role',
  'model',
  'us',
  'constitut',
  'furthermor',
  'appar',
  'marx',
  'engel',
  'strong',
  'influenc',
  'studi',
  'iroquoi',
  'societi',
  'use',
  'p

#### 乱序

In [7]:
ran.shuffle(text_with_lable)
text_with_lable[0]

[12,
 'weswsrhpcom',
 'wes',
 'whiteley',
 'subject',
 'solvent',
 'ducttap',
 'adhes',
 'use',
 'product',
 'call',
 'goofoff',
 'come',
 'littl',
 'yellow',
 'size',
 'deck',
 'play',
 'card',
 'work',
 'well',
 'remov',
 'kind',
 'sticker',
 'tape',
 'residu',
 'note',
 'alway',
 'test',
 'small',
 'area',
 'inconspicu',
 'place',
 'befor',
 'use',
 'good',
 'luck',
 'wes',
 'whiteley',
 'weswsrhpcom']

#### 保存
通过npy文件序列化保存

In [8]:
np.save(save_path,text_with_lable)

### 多项式朴素贝叶斯模型

- P(Wi|S) = （有Wi的文档S出现次数 + 1） / (S类文档所有词 + len(W))

In [9]:
vocab_polynomial_list =  [
    {},{},{},{},{}, 
    {},{},{},{},{},
    {},{},{},{},{}, 
    {},{},{},{},{},
              ]##每一类class中的每个词的Prob统计
vocabs_class =  [
    {},{},{},{},{}, 
    {},{},{},{},{},
    {},{},{},{},{}, 
    {},{},{},{},{},
              ]##每个类别中出现的词表统计
vocab_class_len = [] ##每一类中出现的词（一个词出现j次则统计j次）



In [10]:
##上次储存的vocab文件 格式:{(str)wi:(int)出现次数}
vocab = np.load('/home/blueberry/data/datamining/vocab.npy')
vocab = dict(vocab.item())
keys_vocab = list(vocab.keys())

for dicts in vocab_polynomial_list:
    for k in keys_vocab:
        dicts[k.decode(encoding='utf-8')] = 0.0 ##初始每个类别字典
for dicts in vocabs_class:
    for k in keys_vocab:
        dicts[k.decode(encoding='utf-8')] = 0##初始每个类别字典，这让李decode变成字符

##### 统计类别词频

In [11]:
bar = ProgressBar(total = 18828)
for i,k in enumerate(keys_classis):
    cls_path = []
    traverse(classis[k],cls_path)
    l_num = 0
    for path in cls_path:
        if path[-4:]=='_pre':
            with open(path,'r+') as f :
                bar.move()
                if bar.count%100 == 0:
                    bar.log()
                lines = f.readlines()
                lines = [line.strip('\n') for line in lines]
                l_num += len(lines)
                for l in lines:
                    try:
                        vocabs_class[i][l] += 1 ## 统计值自加改变 vocabs_class 中每个字典的值 
                    except Exception:
                        print('exception',l)## 这里防止出错
    vocab_class_len.append(l_num)
bar.log()

-------------------------------Progressing----------------------------------
18828/18828 with 100.% finished: ##############################


#### 所有文档的总词数

In [12]:
sum(vocab_class_len)

2756184

#### 计算Prob
- 加入平滑

In [13]:
import math
bar = ProgressBar(total = len(vocab))
for wi in keys_vocab:##遍历词典中的每一词
    for i,k in enumerate(keys_classis):##需要计算每一个词对于每一个分类的prob<可以理解为一种影响，加入平滑>
        vocab_polynomial_list[i][wi.decode(encoding='utf-8')] = math.log((vocabs_class[i][wi.decode(encoding='utf-8')] + 1)/(vocab_class_len[i]+len(vocab)),2)##以为底的对数
        ##存对数以便之后求概率× => log概率+
    bar.move()
    if bar.count%10000 == 0:
        bar.log()
bar.log()

-------------------------------Progressing----------------------------------
133015/133015 with 100.% finished: ##############################


In [14]:
vocab_polynomial_list[0][keys_vocab[10].decode(encoding='utf-8')]

-8.954523631593336

In [15]:
def sort_dict(vocab,reverse = True):##默认从大到小排序
    v_tuple = sorted(vocab.items(),key = lambda x:x[1],reverse=reverse) ##从大到小排序
    vocab = {}
    for t in v_tuple:
        vocab[t[0]] = t[1]
    return vocab

In [16]:
for voc in vocab_polynomial_list:
    voc = sort_dict(voc)

### 打印每一类 top5词汇

In [17]:
for i,voc in enumerate(vocab_polynomial_list):
    
    top5 = []
    keys_voc = list(voc.keys())
    for j in range(5):
        top5.append((keys_voc[j],voc[keys_voc[j]]))
    print('第',i,'类top5：','\n',top5,'\n')

第 0 类top5： 
 [('subject', -8.167850118172208), ('one', -7.955855145338879), ('use', -9.200622317889138), ('write', -8.062258083141678), ('would', -8.331040285585793)] 

第 1 类top5： 
 [('subject', -7.994443321600229), ('one', -9.057838402888738), ('use', -7.923412082667811), ('write', -8.8804188648995), ('would', -8.91063347794651)] 

第 2 类top5： 
 [('subject', -7.936060847399895), ('one', -9.123764845641242), ('use', -7.772463127498815), ('write', -8.738228079434506), ('would', -9.252456068618429)] 

第 3 类top5： 
 [('subject', -7.836296965380013), ('one', -8.634173141549553), ('use', -7.854729683476998), ('write', -8.972110820725733), ('would', -8.83348194977296)] 

第 4 类top5： 
 [('subject', -7.806682252514311), ('one', -8.725025032945178), ('use', -8.126439248360047), ('write', -8.82124034820448), ('would', -8.85080480574662)] 

第 5 类top5： 
 [('subject', -7.830200028615227), ('one', -9.071000291910222), ('use', -7.380989237372828), ('write', -9.065607033140003), ('would', -9.308328983551

### 存储所得的类向量表存储到文件中

In [18]:
np.save(open('vocab_polynomial_list.bin','wb'),vocab_polynomial_list)

### 模型测试
- test 数据理论上不应在 train set中

In [19]:
test = text_with_lable[:1000] ##取100个已经shuffle的文件作为test set
X = [t[1:]  for t in test] ##data
Y = [t[0] for t in test]##lables
result = [] ##判断对计为1 否则计为0

#### 分类过程
- 其实就是简单的查表

In [20]:
predict = []
for i,x in enumerate(X):
    prob = [0,0,0,0,0,
           0,0,0,0,0,
           0,0,0,0,0,
           0,0,0,0,0,]
    for j in range(len(classis)):
        for l in x:##对每个词查表求和
            prob[j] += vocab_polynomial_list[j][l]
    predict.append(np.argmax(prob))##找最大的位置作为predict,


In [21]:
for i in range(len(X)):
    result.append(1 if predict[i] == Y[i] else 0)##判断predict==lable?

In [22]:
print('acc  =>  ',sum(result)/len(result)*100,'%')

acc  =>   95.8 %


### 伯努力朴素贝叶斯模型

- P(Wi|S) = （有Wi的文档S数 + 1） / (S类文档出现的词（一个词一个文档只计算一次） + 2)

In [23]:
vocab_bernoulli_list =  [
    {},{},{},{},{}, 
    {},{},{},{},{},
    {},{},{},{},{}, 
    {},{},{},{},{},
              ]##每一类class中的每个词的Prob统计
vocabs_class =  [
    {},{},{},{},{}, 
    {},{},{},{},{},
    {},{},{},{},{}, 
    {},{},{},{},{},
              ]##每个类别中出现的词表统计
vocab_class_len = [] ##每一类中出现的词（一个词在一个文档中至多计算一次）

### 初始化同上

In [24]:
for dicts in vocab_bernoulli_list:
    for k in keys_vocab:
        dicts[k.decode(encoding='utf-8')] = 0.0 ##初始每个类别字典
for dicts in vocabs_class:
    for k in keys_vocab:
        dicts[k.decode(encoding='utf-8')] = 0##初始每个类别字典，这让李decode变成字符

In [25]:
bar = ProgressBar(total = 18828)
for i,k in enumerate(keys_classis):
    cls_path = []
    traverse(classis[k],cls_path)
    l_num = 0
    for path in cls_path:
        if path[-4:]=='_pre':
            with open(path,'r+') as f :
                bar.move()
                if bar.count%100 == 0:
                    bar.log()
                lines = f.readlines()
                lines = [line.strip('\n') for line in lines]
                lines = set(lines)##对同一个document进行去重，set中的元素不能重复
                l_num += len(lines)##这里统计的值也不会统计重复的
                for l in lines:
                    try:
                        vocabs_class[i][l] += 1 ## 统计值自加改变 vocabs_class 中每个字典的值 
                    except Exception:
                        print('exception',l)## 这里防止出错
    vocab_class_len.append(l_num)
bar.log()

-------------------------------Progressing----------------------------------
18828/18828 with 100.% finished: ##############################


#### 所有文档的总词数
- 明显比之前的 200w+ 少了很多

In [26]:
sum(vocab_class_len)

1834133

In [27]:
import math
bar = ProgressBar(total = len(vocab))
for wi in keys_vocab:##遍历词典中的每一词
    for i,k in enumerate(keys_classis):##需要计算每一个词对于每一个分类的prob<可以理解为一种影响，加入平滑>
        vocab_bernoulli_list[i][wi.decode(encoding='utf-8')] = math.log((vocabs_class[i][wi.decode(encoding='utf-8')] + 1)/(vocab_class_len[i]+2),2)##以为底的对数
        ##存对数以便之后求概率× => log概率+
    bar.move()
    if bar.count%10000 == 0:
        bar.log()
bar.log()

-------------------------------Progressing----------------------------------
133015/133015 with 100.% finished: ##############################


In [28]:
vocab_bernoulli_list[0][keys_vocab[10].decode(encoding='utf-8')]

-8.290813885635139

In [29]:
for voc in vocab_bernoulli_list:
    voc = sort_dict(voc)

### 打印top5

In [30]:
for i,voc in enumerate(vocab_bernoulli_list):
    
    top5 = []
    keys_voc = list(voc.keys())
    for j in range(5):
        top5.append((keys_voc[j],voc[keys_voc[j]]))
    print('第',i,'类top5：','\n',top5,'\n')

第 0 类top5： 
 [('subject', -6.7967048153650955), ('one', -7.740121286998728), ('use', -8.685673502976352), ('write', -7.107405654829204), ('would', -7.97703663186864)] 

第 1 类top5： 
 [('subject', -6.382106898559566), ('one', -8.281978864072023), ('use', -7.740029252310961), ('write', -7.721170225059645), ('would', -7.947941086906666)] 

第 2 类top5： 
 [('subject', -6.4943383597192685), ('one', -8.269857194654868), ('use', -7.4972676907579405), ('write', -7.701689936476691), ('would', -8.389933646646618)] 

第 3 类top5： 
 [('subject', -6.135601002453696), ('one', -7.6757691725120925), ('use', -7.218667613666705), ('write', -7.613124235523097), ('would', -7.6972702417230145)] 

第 4 类top5： 
 [('subject', -6.033491687673114), ('one', -7.585832766825072), ('use', -7.439559033447404), ('write', -7.423748518599942), ('would', -7.625972157678287)] 

第 5 类top5： 
 [('subject', -6.4621363908841865), ('one', -8.457731211764186), ('use', -7.420106139464268), ('write', -8.038301943368184), ('would', -8.3

#### 模型测试
- 过程同上

In [31]:
result = [] ##判断对计为1 否则计为0
predict = []
for i,x in enumerate(X):
    prob = [0,0,0,0,0,
           0,0,0,0,0,
           0,0,0,0,0,
           0,0,0,0,0,]
    for j in range(len(classis)):
        for l in x:##对每个词查表求和
            prob[j] += vocab_bernoulli_list[j][l]
    predict.append(np.argmax(prob))##找最大的位置作为predict,
for i in range(len(X)):
    result.append(1 if predict[i] == Y[i] else 0)##判断predict==lable?

In [32]:
print('acc  =>  ',sum(result)/len(result)*100,'%')

acc  =>   94.6 %
