# Naive Bayes Classifier (NBC)

In [9]:
import numpy as np
import re

## 1. Probability Review
### 1.1 Conditional Probability
$$p(c|x) = \frac{p(c,x)}{p(x)}$$
### 1.2 Bayes' Theorem
$$p(c|x) = \frac{p(x|c)p(c)}{p(x)}$$
### 1.3 Principle of Classification
Denote $c_1$ as class 1, $c_2$ as class 2, $x$, $y$ are two independent features, if we have:
$$\begin{align*}
p(c_1|x,y)&>p(c_2|x,y)\\
\frac{p(x,y|c_1)p(c_1)}{p(x,y)}&>\frac{p(x,y|c_2)p(c_2)}{p(x,y)}\text{,}
\end{align*}$$
then we say the subject is more likely to be a member of $c_1$.

## 2. Text Classification
### 2.1 Text Processing

In [2]:
def create_dataset():
    postings=[['my','dog','has','flea','problems','help','please'],\
             ['maybe','not','take','him','to','dog','park','stupid'],\
             ['my','dalmation','is','so','cute','I','love','him'],\
             ['stop','posting','stupid','worthless','garbage'],\
             ['mr','licks','ate','my','steaks','how','to','stop','him'],\
             ['quit','buying','worthless','dog','food','stupid']]
    labels = [0,1,0,1,0,1] #1 is insulting words, 0 is not
    return postings, labels

def create_vocab_list(dataset):
    vocab_list = set([])
    for record in dataset:
        vocab_list = vocab_list|set(record)
    return list(vocab_list)

def record_to_vector(record, vocab_list):
    vector = [0]*len(vocab_list)
    for word in record:
        if word in vocab_list:
            vector[vocab_list.index(word)] += 1 
            #each time of apperance is recorded, this model is called as bag-of-words-model
            #another model is the set-of-words-model, which only cares about if one word appears or not
            #vector[vocab_list.index(word)] = 1 
        else:
            print('The word %s is not in the vocabulary list.'%str(word))
    return vector

In [3]:
postings, labels = create_dataset()
v_list = create_vocab_list(postings)
print(v_list)
vector_0 = record_to_vector(postings[0], v_list)
print(vector_0)

['stop', 'flea', 'maybe', 'ate', 'licks', 'help', 'quit', 'I', 'him', 'love', 'how', 'cute', 'has', 'not', 'stupid', 'my', 'garbage', 'food', 'please', 'posting', 'so', 'worthless', 'park', 'take', 'mr', 'to', 'dog', 'buying', 'is', 'dalmation', 'steaks', 'problems']
[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1]


### 2.2 Training
Denote $w$ as a vector of a record, then we use:
$$p(c_i|w)=\frac{p(w|c_i)p(c_i)}{p(w)}$$
to calculate the probability of belonging to class $i$.  
If all features, *i.e.* $w_0,w_1,w_2,...$ are all indepent of each other, then we have:
$$\begin{align*}
p(w|c_i) &= p(w_0,w_1,w_2,\cdots|c_i)\\
&= p(w_0|c_i)p(w_1|c_i)p(w_2|c_i)\cdots p(w_N|c_i)
\end{align*}$$
It is obvious that the denominator $p(w)$ is always a constant, so that it could be omitted during classification.

In [4]:
def nbc(vectorset, labels):
    #vectorset here is collection of vectors generated by function record_to_vector
    m = len(vectorset) #number of records
    n = len(vectorset[0]) #number of features
    p_class1 = np.sum(labels)/len(labels) #p of a record being an insult one
    nump0 = np.zeros(n)
    nump1 = np.zeros(n)
    denomp0 = 0
    denomp1 = 0
    for i in range(m):
        if labels[i] == 1: #if this is an insulting one
            nump1 += vectorset[i] #numpy calculation, element-wise
            denomp1 += sum(vectorset[i]) #+= number of words in this record
        else:
            nump0 += vectorset[i]
            denomp0 += sum(vectorset[i])
    p1_vector = nump1/denomp1 #p of each word in class 1
    p0_vector = nump0/denomp0 #p of each word in class 2
    print(denomp0,denomp1)
    return p0_vector,p1_vector,p_class1

In [5]:
postings, labels = create_dataset()
v_list = create_vocab_list(postings)
vectorset = []
for record in postings:
    vectorset.append(record_to_vector(record, v_list))
vectorset = np.array(vectorset)
p0_vector,p1_vector,p_class1 = nbc(vectorset, labels)
print('Probability of a record to be insulting: ', p_class1)
print('p0 vector: ', p0_vector)
print('p1 vector: ', p1_vector)

24 19
Probability of a record to be insulting:  0.5
p0 vector:  [0.04166667 0.04166667 0.         0.04166667 0.04166667 0.04166667
 0.         0.04166667 0.08333333 0.04166667 0.04166667 0.04166667
 0.04166667 0.         0.         0.125      0.         0.
 0.04166667 0.         0.04166667 0.         0.         0.
 0.04166667 0.04166667 0.04166667 0.         0.04166667 0.04166667
 0.04166667 0.04166667]
p1 vector:  [0.05263158 0.         0.05263158 0.         0.         0.
 0.05263158 0.         0.05263158 0.         0.         0.
 0.         0.05263158 0.15789474 0.         0.05263158 0.05263158
 0.         0.05263158 0.         0.10526316 0.05263158 0.05263158
 0.         0.05263158 0.10526316 0.05263158 0.         0.
 0.         0.        ]


### 2.3 Modification
When calculating $p(w|c_i) = p(w_0|c_i)p(w_1|c_i)p(w_2|c_i)\cdots p(w_N|c_i)$, if any item equals to zero, then the result is also zero. To avoid this, during initialization, we set the appearance of each word as 1, and set denominator as 2.  
Another problem is that if too mucn items are with small values, then the result will again becomes zero because of truncation. The solution is to use logrithm.

In [6]:
def modified_nbc(vectorset, labels):
    m = len(vectorset) #number of records
    n = len(vectorset[0]) #number of features
    p_class1 = np.sum(labels)/len(labels) #p of a record being an insult one
    nump0 = np.ones(n)
    nump1 = np.ones(n)
    denomp0 = 2
    denomp1 = 2
    for i in range(m):
        if labels[i] == 1: #if this is an insulting one
            nump1 = nump1 + vectorset[i] #numpy calculation, element-wise
            denomp1 = denomp1 + sum(vectorset[i]) #+= number of words in this record
        else:
            nump0 = nump0 + vectorset[i]
            denomp0 = denomp0 + sum(vectorset[i])
    p1_vector = np.log(nump1/denomp1) #p of each word in class 1
    p0_vector = np.log(nump0/denomp0) #p of each word in class 2
    return p0_vector,p1_vector,p_class1

In [7]:
def classify(record_vector, p0_vector,p1_vector, p_class1):
    p1 = np.sum(record_vector*p1_vector) + np.log(p_class1) #ln(a*b) = ln(a) + ln(b)
    p0 = np.sum(record_vector*p0_vector) + np.log(1-p_class1)
    if p1 > p0:
        return 1
    else:
        return 0
    
def test_nbc(subject, dataset, labels):
    vector_set = []
    v_list = create_vocab_list(dataset)
    for record in dataset:
        vector = record_to_vector(record, v_list)
        vector_set.append(vector)
    vector_set = np.array(vector_set)
    p0_vector, p1_vector, p_class1 = modified_nbc(vectorset, labels)
    subject_vector = record_to_vector(subject, v_list)
    label = classify(subject_vector, p0_vector, p1_vector, p_class1)
    return label

In [8]:
postings, labels = create_dataset()
subj1 = 'love my dalmation'.split()
subj2 = 'stupid garbage'.split()
print('Class of ' + str(subj1) + ' is: ', test_nbc(subj1, postings, labels))
print('Class of ' + str(subj2) + ' is: ', test_nbc(subj2, postings, labels))

Class of ['love', 'my', 'dalmation'] is:  0
Class of ['stupid', 'garbage'] is:  1


## 3. Spam Filtering