# Naive Bayes

### Model
#### $P(y|X)=\frac{P(X|y)P(y)}{P(X)}$

if you have a lot of variables: <br>
#### $P(y|x_1, ..., x_n)=\frac{P(x_1|y)P(x_2|y)...P(x_n|y)P(y)}{P(x_1)P(x_2)...P(x_n)}$

__*!!! Since the denominator is the same for every target class, it can be omitted from the calculation; only the numerators need to be compared.*__
###### $P(y|x_1, ..., x_n) \propto P(x_1|y)P(x_2|y)...P(x_n|y)P(y)$
<br>
Naive Bayes uses the variables X to classify the target y by comparing the probabilities of the input belonging to target 1, target 2, ..., target n.

### Assumption
1. Features are independent of each other (given the target).
2. Every feature is equally important.

**Step**
1. separate the dataset by target, you need to have a value count table for each class
2. calculate the probability of each target: $P(Y)$ = number of Y / total number of entities
3. for loop each target group:
    * sum up the frequency for each unique word
    * calculate the probability of each word in the target group, **remark: it is $P(X|y)$, conditional prob of x given by target y**
    
Now, we have $P(Y)$ and $P(X|y)$; $P(X)$ is the same for every target, so it is not needed for the comparison<br>
**Input new data** <br>
for loop each target group:
1. multiply the conditional probabilities for the input data: $P(y|x_1, ..., x_n)=\frac{P(x_1|y)P(x_2|y)...P(x_n|y)P(y)}{P(x_1)P(x_2)...P(x_n)}$
2. compare probability (likelihood) and assign the target with highest prob to the input data

In [1]:
import pandas as pd
import numpy as np
import os
import re
from collections import Counter

In [2]:
# Build one flat list of tokens per class from the 25 numbered e-mails in
# data/email/ham and data/email/spam.
ham_word_list = []
spam_word_list = []
for i in range(1, 26):
    # `with` closes each handle as soon as the text is read; the bare
    # open(...).read() form left every file open until garbage collection.
    with open(f'data/email/ham/{i}.txt', 'r') as ham_handle:
        ham_file = ham_handle.read()
    # Split on runs of non-word characters, keep tokens longer than two
    # characters (this also drops the empty strings re.split can produce),
    # and lowercase them.
    ham_words = [j.lower() for j in re.split(r"\W+", ham_file) if len(j) > 2]
    ham_word_list.extend(ham_words)

    with open(f'data/email/spam/{i}.txt', 'r') as spam_handle:
        spam_file = spam_handle.read()
    spam_words = [j.lower() for j in re.split(r"\W+", spam_file) if len(j) > 2]
    spam_word_list.extend(spam_words)

In [3]:
# Rebuild the per-class token lists from the two aggregated files
# (data/email/ham/full.txt and data/email/spam/spam_full.txt).
ham_word_list = []
spam_word_list = []

# `with` closes the handle right after reading; the bare open(...).read()
# form left the file open until garbage collection.
with open('data/email/ham/full.txt', 'r', encoding='utf-8') as ham_handle:
    ham_file = ham_handle.read()
# Split on runs of non-word characters, keep tokens longer than two characters
# (which also drops empty strings from re.split), and lowercase them.
ham_words = [j.lower() for j in re.split(r"\W+", ham_file) if len(j) > 2]
ham_word_list.extend(ham_words)

with open('data/email/spam/spam_full.txt', 'r', encoding='utf-8') as spam_handle:
    spam_file = spam_handle.read()
spam_words = [j.lower() for j in re.split(r"\W+", spam_file) if len(j) > 2]
spam_word_list.extend(spam_words)

In [4]:
# Ham: one row per distinct word with its frequency, plus the share of all
# ham tokens that the word accounts for (column 'CP').
value_count = Counter(ham_word_list)
ham_df = (
    pd.DataFrame.from_dict(value_count, orient='index', columns=['count'])
    .reset_index()
    .rename(columns={'index': 'word'})
)
ham_df['CP'] = ham_df['count'] / ham_df['count'].sum()

# Spam: same table built from the spam tokens.
value_count = Counter(spam_word_list)
spam_df = (
    pd.DataFrame.from_dict(value_count, orient='index', columns=['count'])
    .reset_index()
    .rename(columns={'index': 'word'})
)
spam_df['CP'] = spam_df['count'] / spam_df['count'].sum()

# --------------------------

In [144]:
def loadDataSet():
    """Return the toy corpus and its labels.

    Returns:
        A ``(postingList, classVec)`` pair: six tokenised posts (lists of
        words) and a parallel list of labels where 1 marks an abusive post
        and 0 a non-abusive one.
    """
    posts = (
        'my dog has flea problems help please',
        'maybe not take him to dog park stupid',
        'my dalmation is so cute I love him',
        'stop posting stupid worthless garbage',
        'mr licks ate my steak how to stop him',
        'quit buying worthless dog food stupid',
    )
    postingList = [post.split(' ') for post in posts]
    # Labels alternate: non-abusive, abusive, non-abusive, ...
    classVec = [0, 1] * 3
    return postingList, classVec

# Collect every distinct word that occurs anywhere in the corpus.
def create_vocab_list(data_set):
    """Return a list of the unique words found across all documents.

    Args:
        data_set: Iterable of documents, each an iterable of hashable words.

    Returns:
        A list with one entry per distinct word. The order is whatever the
        underlying set yields and should not be relied upon.
    """
    vocabulary = set()
    for words in data_set:
        # Merge this document's words into the running vocabulary.
        vocabulary = vocabulary.union(set(words))
    return list(vocabulary)

In [162]:
def Conditional_Prob(create_vocab_list, data_set, class_set):
    """Estimate class priors and per-word conditional probabilities.

    Args:
        create_vocab_list: List of vocabulary words; a word's position in
            this list is its position in the returned vectors.
        data_set: Documents (sequences of words), indexed in step with
            ``class_set``.
        class_set: Label per document. Only labels equal to 0 or 1 contribute
            word counts.

    Returns:
        ``(P_0, P_1, CP_0, CP_1)`` where ``P_0`` is the fraction of labels
        equal to 0, ``P_1`` is ``1 - P_0``, and ``CP_0`` / ``CP_1`` are arrays
        of add-one-smoothed word frequencies normalised to sum to one.

    Raises:
        ValueError: If a counted document contains a word that is not in the
            vocabulary list.
        ZeroDivisionError: If ``class_set`` is empty.
    """
    vocab_size = len(create_vocab_list)
    # Every word starts at one so that unseen words never get probability 0.
    smoothed_counts = [np.ones(vocab_size), np.ones(vocab_size)]

    for doc_idx, label in enumerate(class_set):
        if label == 0:
            tally = smoothed_counts[0]
        elif label == 1:
            tally = smoothed_counts[1]
        else:
            continue
        for word in data_set[doc_idx]:
            tally[create_vocab_list.index(word)] += 1

    P_0 = Counter(class_set)[0] / len(class_set)
    CP_0, CP_1 = (tally / tally.sum() for tally in smoothed_counts)
    return P_0, 1 - P_0, CP_0, CP_1

In [164]:
# Fit the model on the toy corpus: load the posts and labels, build the
# vocabulary, then estimate the class priors (P_0, P_1) and the per-word
# conditional probability vectors (CP_0, CP_1).
data_set, class_set = loadDataSet()
vocab_list = create_vocab_list(data_set)
word_count = len(vocab_list)  # number of distinct words in the corpus
P_0, P_1, CP_0, CP_1 = Conditional_Prob(vocab_list, data_set, class_set)

In [206]:
def Classify(vocab_list, P_0, P_1, CP_0, CP_1, test):
    """Assign a document to class '0' or '1' by comparing log-posteriors.

    For each class the score is ``sum(log P(word | class)) + log P(class)``,
    i.e. the logarithm of the Naive Bayes numerator. The prior has to be
    added in log space; multiplying the summed log-likelihood by the prior
    (as this function previously did) does not rank the classes by posterior
    probability.

    Args:
        vocab_list: Vocabulary words; positions index into ``CP_0``/``CP_1``.
        P_0: Prior probability of class 0.
        P_1: Prior probability of class 1.
        CP_0: Per-word conditional probabilities for class 0.
        CP_1: Per-word conditional probabilities for class 1.
        test: Iterable of words to classify. Words that are not in
            ``vocab_list`` are ignored.

    Returns:
        The string ``'0'`` if class 0 has the strictly larger score,
        otherwise ``'1'`` (ties go to ``'1'``).
    """
    # Map each word to its first position, matching list.index semantics,
    # so every lookup is O(1) instead of a scan of the vocabulary.
    word_to_index = {}
    for position, word in enumerate(vocab_list):
        word_to_index.setdefault(word, position)

    test_index = np.asarray(
        [word_to_index[word] for word in test if word in word_to_index],
        dtype=np.intp,
    )

    log_score_0 = np.sum(np.log(np.asarray(CP_0)[test_index])) + np.log(P_0)
    log_score_1 = np.sum(np.log(np.asarray(CP_1)[test_index])) + np.log(P_1)

    if log_score_0 > log_score_1:
        return '0'
    else:
        return '1'

In [208]:
# Read every file found under data/email/<sub folder>/ and collect the raw
# text of each one in `data`.
data = []
root_dir = "data/email"
for sub_folder in os.listdir(root_dir):
    for txt_file in os.listdir(f'{root_dir}/{sub_folder}'):
        with open(f'{root_dir}/{sub_folder}/{txt_file}', encoding="latin-1") as f:
            file_content = f.read()
        # Store the text; previously each file's content was overwritten by
        # the next one and `data` stayed empty.
        data.append(file_content)

In [None]:
# NOTE(review): unfinished cell - the `for` statement below has no loop
# target or body, so this cell does not parse as written.
data = []
root_dir = 'data/email'
for 

In [209]:
import os

root_dir = "data/email"

# Walk one level down from root_dir and build the path of every .txt file.
for folder in os.listdir(root_dir):
    folder_path = os.path.join(root_dir, folder)
    if not os.path.isdir(folder_path):
        # Skip plain files that sit directly in root_dir.
        continue
    for file in os.listdir(folder_path):
        if not file.endswith(".txt"):
            continue
        # Placeholder: the path is computed but nothing is done with it yet.
        file_path = os.path.join(folder_path, file)

['Hi Peter,\n\nWith Jose out of town, do you want to\nmeet once in a while to keep things\ngoing and do some interesting stuff?\n\nLet me know\nEugene',
 "Yay to you both doing fine!\n\nI'm working on an MBA in Design Strategy at CCA (top art school.)  It's a new program focusing on more of a right-brained creative and strategic approach to management.  I'm an 1/8 of the way done today!",
 "WHat is going on there?\nI talked to John on email.  We talked about some computer stuff that's it.\n\nI went bike riding in the rain, it was not that cold.\n\nWe went to the museum in SF yesterday it was $3 to get in and they had\nfree food.  At the same time was a SF Giants game, when we got done we\nhad to take the train with all the Giants fans, they are 1/2 drunk.",
 "Yo.  I've been working on my running website.  I'm using jquery and the jqplot plugin.  I'm not too far away from having a prototype to launch.  \n\nYou used jqplot right?  If not, I think you would like it.",
 'There was a guy at

In [213]:
# List the name of every entry inside each sub folder of data/email.
root_dir = "data/email"
for sub_folder in os.listdir(root_dir):
    sub_folder_path = f'{root_dir}/{sub_folder}'
    for i in os.listdir(sub_folder_path):
        print(i)

1.txt
10.txt
11.txt
12.txt
13.txt
14.txt
15.txt
16.txt
17.txt
18.txt
19.txt
2.txt
20.txt
21.txt
22.txt
23.txt
24.txt
25.txt
3.txt
4.txt
5.txt
6.txt
7.txt
8.txt
9.txt
full.txt
1.txt
10.txt
11.txt
12.txt
13.txt
14.txt
15.txt
16.txt
17.txt
18.txt
19.txt
2.txt
20.txt
21.txt
22.txt
23.txt
24.txt
25.txt
3.txt
4.txt
5.txt
6.txt
7.txt
8.txt
9.txt
spam_full.txt


In [195]:
import random
import numpy as np


def load_data(folder):
    """Read and tokenise the files ``1.txt`` .. ``25.txt`` found in *folder*.

    Args:
        folder: Directory path that contains the numbered text files.

    Returns:
        A list with one entry per file, in numeric order, each being whatever
        ``text_parse`` returns for that file's text.
    """
    documents = []
    for index in range(1, 26):
        path = f"{folder}/{index}.txt"
        with open(path, encoding="latin-1") as handle:
            raw_text = handle.read()
        documents.append(text_parse(raw_text))
    return documents


def split_data(data, train_size=0.8, classes=None):
    """Split documents, in order, into a training part and a test part.

    Labels are attached to each document *before* splitting. Previously the
    label was derived from the document's position relative to the
    train/test cut, which mislabelled documents whenever the class boundary
    (the middle of ``data``) did not coincide with the middle of each part.

    Note that the split is positional, not shuffled: the first
    ``int(len(data) * train_size)`` documents are used for training and the
    rest for testing.

    Args:
        data: Sequence of documents. When ``classes`` is omitted, the first
            half is labelled 1 and the second half 0, matching a corpus built
            as class-1 documents followed by an equal number of class-0
            documents.
        train_size: Fraction of ``data`` (counted from the front) that goes
            into the training part.
        classes: Optional explicit labels, one per document, for corpora
            that do not follow the half/half layout.

    Returns:
        ``(train_set, train_classes, test_set, test_classes)`` as lists.

    Raises:
        ValueError: If ``classes`` is given and its length differs from
            ``len(data)``.
    """
    n_docs = len(data)
    if classes is None:
        classes = [1 if i < n_docs / 2 else 0 for i in range(n_docs)]
    else:
        classes = list(classes)
        if len(classes) != n_docs:
            raise ValueError(
                f"classes has {len(classes)} entries but data has {n_docs}"
            )

    train_set = []
    test_set = []
    train_classes = []
    test_classes = []
    n_train = int(n_docs * train_size)

    for i, (doc, label) in enumerate(zip(data, classes)):
        if i < n_train:
            train_set.append(doc)
            train_classes.append(label)
        else:
            test_set.append(doc)
            test_classes.append(label)

    return train_set, train_classes, test_set, test_classes


def train_and_evaluate(train_set, train_classes, test_set, test_classes):
    """Train on the training documents, print the test error rate, return the vocabulary.

    The vocabulary is built from ``train_set`` only. Every document is turned
    into a vector with ``bag_of_words_2_vec_mn``, the model is fitted with
    ``train_nb0`` and the test vectors are passed to ``classify_nb``; those
    helpers are defined outside this block.

    Args:
        train_set: Training documents.
        train_classes: Labels for ``train_set``.
        test_set: Test documents.
        test_classes: Labels for ``test_set``.

    Returns:
        The vocabulary list built from ``train_set``.
    """
    vocab_list = create_vocab_list(train_set)

    train_vectors = [bag_of_words_2_vec_mn(vocab_list, doc) for doc in train_set]
    train_labels = np.array(train_classes)
    test_vectors = [bag_of_words_2_vec_mn(vocab_list, doc) for doc in test_set]

    p0v, p1v, p_spam = train_nb0(np.array(train_vectors), train_labels)

    predictions = classify_nb(np.array(test_vectors), p0v, p1v, p_spam)
    expected = np.array(test_classes)
    # Count the positions where the prediction disagrees with the label.
    error_count = sum(predictions != expected)
    error_rate = error_count / len(test_set)

    print(f"The error rate is: {error_rate:.2%}")

    return vocab_list


def spam_test():
    """Run the spam/ham experiment end to end and return the vocabulary.

    Loads the spam documents followed by the ham documents, splits them with
    ``split_data`` (default settings) and hands the four parts to
    ``train_and_evaluate``.

    Returns:
        The vocabulary list returned by ``train_and_evaluate``.
    """
    # NOTE(review): these paths start at "email/", while other cells in this
    # notebook read from "data/email/" - confirm the working directory.
    documents = load_data("email/spam") + load_data("email/ham")
    return train_and_evaluate(*split_data(documents))

'1'