In [3]:
import os, sys
import re
import jieba
import numpy as np

In [4]:
data_file = '../dataset/email'

In [5]:
test_data = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
             ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
             ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
             ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
             ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
             ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]

test_data_class = [0, 1, 0, 1, 0, 1] # 1 - negative, 0 - postive

In [6]:
def buildVocabList(data):
    vocab_list = []
    for example in data:
        vocab_list.extend(example)
    return list(set(vocab_list))

In [7]:
vocab_list = buildVocabList(test_data)
print(vocab_list)

['I', 'garbage', 'problems', 'please', 'food', 'has', 'park', 'not', 'cute', 'ate', 'stupid', 'quit', 'love', 'to', 'licks', 'mr', 'my', 'buying', 'worthless', 'dalmation', 'flea', 'how', 'dog', 'stop', 'take', 'so', 'help', 'maybe', 'steak', 'him', 'posting', 'is']


In [8]:
def word2Onehot(vocab_list, sentence):
    res = np.zeros((len(vocab_list)))
    for word in sentence:
        res[vocab_list.index(word)] = 1
    return res

In [9]:
test_vec = np.zeros((len(test_data), len(vocab_list)))
for i in range(len(test_data)):
    test_vec[i] = word2Onehot(vocab_list, test_data[i])
print(test_vec)

[[0. 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0.
  0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
  1. 0. 0. 1. 0. 1. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0.
  0. 1. 0. 0. 0. 1. 0. 1.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1.
  0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 1. 0. 1.
  0. 0. 0. 0. 1. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0.
  0. 0. 0. 0. 0. 0. 0. 0.]]


$p(c|\mathbf{w})=\frac{p(\mathbf{w}|c)p(c)}{p(\mathbf{w})}$

In [10]:
def naiveBayesTrainer(data_mat, data_class):
    word_num = len(data_mat[0])
    data_num = len(data_mat)
    post_prob = np.sum(data_class)/float(6) #p(c_1) = p(1)
    # nega_prob = 1 - post_prob # p(c_0) = p(0)
    post_class_word = np.ones(word_num)
    nega_class_word = np.ones(word_num)
    post_num = 1.0
    nega_num = 1.0
    for i in range(data_num):
        # print(i,"-th data, class is ", test_data_class[i])
        if data_class[i] == 0: # negative
            nega_class_word += data_mat[i] # frequence of word in 0-class 
            # print(nega_class_word)
            nega_num += sum(data_mat[i]) # num of word in 0-class
            # print(nega_num)
        else: # postive
            post_class_word += data_mat[i]
            # print(post_class_word)
            post_num += sum(data_mat[i])
            # print(post_num)
    post_vec = np.log(post_class_word/post_num) # p(w|1)
    nega_vec = np.log(nega_class_word/nega_num) # p(w|0)
    return post_vec, nega_vec, post_prob

In [11]:
post_vec, nega_vec, post_prob = naiveBayesTrainer(test_vec, test_data_class)
print(post_vec)
print(nega_vec)
print(post_prob)

[-2.99573227 -2.30258509 -2.99573227 -2.99573227 -2.30258509 -2.99573227
 -2.30258509 -2.30258509 -2.99573227 -2.99573227 -1.60943791 -2.30258509
 -2.99573227 -2.30258509 -2.99573227 -2.99573227 -2.99573227 -2.30258509
 -1.89711998 -2.99573227 -2.99573227 -2.99573227 -1.89711998 -2.30258509
 -2.30258509 -2.99573227 -2.99573227 -2.30258509 -2.99573227 -2.30258509
 -2.30258509 -2.99573227]
[-2.52572864 -3.21887582 -2.52572864 -2.52572864 -3.21887582 -2.52572864
 -3.21887582 -3.21887582 -2.52572864 -2.52572864 -3.21887582 -3.21887582
 -2.52572864 -2.52572864 -2.52572864 -2.52572864 -1.83258146 -3.21887582
 -3.21887582 -2.52572864 -2.52572864 -2.52572864 -2.52572864 -2.52572864
 -3.21887582 -2.52572864 -2.52572864 -3.21887582 -2.52572864 -2.12026354
 -3.21887582 -2.52572864]
0.5


In [12]:
sample = ['love', 'my', 'dalmation']
vec = word2Onehot(vocab_list, sample)
print(vec)
print(vec*post_vec)
p1 = np.sum(vec * post_vec) + np.log(post_prob)
p0 = np.sum(vec * nega_vec) + np.log(1-post_prob)
print(p1)
print(p0)
if p1 >= p0:
    print(1)
else:
    print(0)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0.]
[-0.         -0.         -0.         -0.         -0.         -0.
 -0.         -0.         -0.         -0.         -0.         -0.
 -2.99573227 -0.         -0.         -0.         -2.99573227 -0.
 -0.         -2.99573227 -0.         -0.         -0.         -0.
 -0.         -0.         -0.         -0.         -0.         -0.
 -0.         -0.        ]
-9.680344001221918
-7.577185932924767
0


In [13]:
def naiveBayesClassifier(input_vec, post_vec, nega_vec, post_prob):
    p1 = np.sum(input_vec * post_vec) + np.log(post_prob)
    p0 = np.sum(input_vec * nega_vec) + np.log(1-post_prob)
    if p1 > p0: return 1
    else: return 0

In [14]:
print(naiveBayesClassifier(vec, post_vec, nega_vec, post_prob))

0
