## 情感分析
* 加载数据
* 数据处理
* 使用朴素贝叶斯进行情感分类

### 1 加载数据

In [1]:
import os
path = './data/aclImdb/train/'
pos_path = path + 'pos/'
neg_path = path + 'neg/'
pos_files = [pos_path + x for x in 
			 filter(lambda x: x.endswith('.txt'), os.listdir(pos_path))]
neg_files = [neg_path + x for x in 
			 filter(lambda x: x.endswith('.txt'), os.listdir(neg_path))]
pos_list = [open(x, 'r', encoding='utf-8').read().lower() for x in pos_files]
neg_list = [open(x, 'r', encoding='utf-8').read().lower() for x in neg_files]
data_list = pos_list + neg_list
labels_list = [1] * len(pos_list) + [0] * len(neg_list)
# shuffle if you'd like ===========================
from random import shuffle
merged_data = list(zip(data_list, labels_list))
shuffle(merged_data)
data_list, label_list = list(zip(*merged_data))

In [2]:
data_list[:2]

("after a day at work, i sat down to relax and turned on the movie channels. the movie came up on the guide and sounded interesting so i tuned in just before it started. the first 30 minutes were enough to make me interested, but the lack of acting ability in jamie foxx and the slow plot movement made me want to get up and find food during the movie. if there is any credit to be given for acting in this movie it should go to david morse who at least tries to make the movie interesting. all in all, don't plan on impressing your friends by picking this one as a renter for a movie night.",
 "i saw the big bad swim at the 2006 temecula film festival, and was totally caught off guard by how much i was drawn into it.<br /><br />the film centers around the lives of a group of people taking an adult swim class for various reasons. a humorous idea in its own right, the class serves as a catalyst for greater changes in the students' lives.<br /><br />what surprised me about the film was how real

In [3]:
set(label_list)

{0, 1}

In [4]:
(len(data_list),len(label_list))

(25000, 25000)

### 2 数据处理

In [5]:
from keras.preprocessing.text import Tokenizer
max_vocab_size = 50000
tokenizer = Tokenizer(num_words=max_vocab_size)
tokenizer.fit_on_texts(data_list)
# tokenizer.texts_to_matrix() input a list of raw text, return a numpy matrix
# for one text to test
tf_idf_data_121 = tokenizer.texts_to_matrix([data_list[121]], mode='tfidf') 
# for total text
tf_idf_data = tokenizer.texts_to_matrix(data_list, mode='tfidf')
# ===================================================
# Implement this by yourself without this Tokenizer, it's also really easy. ^_^
# Well you can just use it, enjoy!

Using TensorFlow backend.


In [6]:
(len(tf_idf_data),tf_idf_data[0].shape)

(25000, (50000,))

In [7]:
tf_idf_data[0]

array([0.        , 2.14727243, 1.69492924, ..., 0.        , 0.        ,
       0.        ])

In [8]:
import numpy as np
label_list = np.array(label_list)

### 3 构建朴素贝叶斯模型
* 与传统朴素贝叶斯不同 使用tf-idf 所以不分传统的混合模型，伯努利模型，多项式模型
* 计算每一类概率P(Ci|w)分母相同，只计算分子
* 使用条件独立假设建模分子中的 P(w|Ci) = P(w1|Ci)*... P(wn|Ci) 所以要建模 每一类的 prob log(P(wn|Ci))

In [9]:
import math
class Model(object):
    ## 朴素贝叶斯
    def train(self,train_matrix,train_label):
        
        vector_len = len(train_matrix[0])
        class_number = len(set(train_label))
        item_number = len(train_matrix)
        
        self.vector_len = vector_len
        self.class_number = class_number
        self.item_number = item_number
        
        PC = {}
        
        # 先验概率
        
        for i in range(class_number):
            PC[i] = 0.0
        for label in train_label:
            PC[label] += 1
        
        for i in range(class_number):
            PC[i] = (PC[i]+1)/(item_number+class_number)
        
        print(PC)
        
        PWC = []
        ## 每个类的词表概率
        
        vocb_ones_vector = np.ones(vector_len,)  #[vocab_size,]
        
        for n in range(class_number):
            VCn = np.where(train_label == n) ## 加速            
            VCn_vector = (np.sum(train_matrix[VCn],0) +1)/(len(VCn[0])+class_number) ## 加入平滑
            PWC.append(np.log(VCn_vector))
        
        self.PWC = PWC
        self.PC = PC
        
        print("Fit Successfully!")
        
    def predict(self,vector):
        ## 使用tf_idf 加权
        PCW = []
        ## 文档概率
        for n in range(self.class_number):
            PCW.append(self.PC[n]*np.sum(vector * self.PWC[n])) ## 使用向量对应乘而不是点乘加权
        
        return np.argmax(PCW) ## 分类结果
    
        
        

In [28]:
model = Model()
model.train(tf_idf_data[:-1000],label_list[:-1000])

{0: 0.4995000416631947, 1: 0.5004999583368053}
Fit Successfully!


### 4 测试


#### train_acc

In [29]:
predict_label = []
for v in tf_idf_data[:-1000]:
    predict_label.append(model.predict(v))
predict_label = np.array(predict_label)
len(np.where(predict_label == label_list[:-1000])[0])/len(label_list[:-1000])

0.9450416666666667

#### test_acc

In [30]:
predict_label = []
for v in tf_idf_data[-1000:]:
    predict_label.append(model.predict(v))
predict_label = np.array(predict_label)
len(np.where(predict_label == label_list[-1000:])[0])/len(label_list[-1000:])

0.839